From 5689b7a8af2350dbabf946a7909d28b48d1884c1 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Thu, 23 May 2024 15:10:41 +0000 Subject: [PATCH] Add changes for 0fc4ad20be7d74ce41da7c61dcc69f76bfca3882 --- latest/design.html | 9 ++++++++- latest/genindex.html | 2 ++ latest/objects.inv | Bin 7849 -> 7864 bytes latest/searchindex.js | 2 +- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/latest/design.html b/latest/design.html index c11576881..a803d1468 100644 --- a/latest/design.html +++ b/latest/design.html @@ -141,6 +141,7 @@
  • StopCriterionReached
  • check_argument_list()
  • check_argument_type()
  • +
  • check_restriction()
  • check_restrictions()
  • check_stop_criterion()
  • check_thread_block_dimensions()
  • @@ -1689,10 +1690,16 @@

    Util Functions +
    +kernel_tuner.util.check_restriction(restrict, params: dict) bool
    +

    Check whether a configuration meets a search space restriction.

    +
    +
    kernel_tuner.util.check_restrictions(restrictions, params: dict, verbose: bool) bool
    -

    Check whether a specific configuration meets the search space restrictions.

    +

    Check whether a configuration meets the search space restrictions.

    diff --git a/latest/genindex.html b/latest/genindex.html index d3b78ab8d..43f57535a 100644 --- a/latest/genindex.html +++ b/latest/genindex.html @@ -191,6 +191,8 @@

    C

  • check_argument_type() (in module kernel_tuner.util)
  • check_kernel_output() (kernel_tuner.core.DeviceInterface method) +
  • +
  • check_restriction() (in module kernel_tuner.util)
  • check_restrictions() (in module kernel_tuner.util)
  • diff --git a/latest/objects.inv b/latest/objects.inv index 7a785d36f3308c130eda79c70b9bcfdcd4139ea0..bbac934966d45c97a7ed7e290a627c3ecdec7eef 100644 GIT binary patch delta 6186 zcmV+_7}e*gJ-9uv#s+_FzhK}HW*`KgWl?QC3j|r7*zcCbeLK)EH61E66N6J)_Lq7y z@Dj&t)f5dUU!M|hl?8gq9r zjBQp0hKDRgAh5WkB{a5@(SJ0++)?dq%1N~LJ_h@;gSDT^VR5WsDQO{%b9#V1?Jztu zH}_165^mG=hZgE~nuph7*N#P^es@NCPBt~BoCtE9jT3(_0lQ&UWo5cpB!-yY19Lf> zHVw0AN5*#|AD@4)rnxG}Mbit{mX&Zz!zXs9nNItuB~@Am1Z6OCHtrT%AI-qW-u|315})}+GOPx70t zakG7Ks9t3(80jhmp^9Rpz}K0I6>+ z+pw-zMSo+Tt=&$a0K>$)u$kxebjzNQPx60m1=)LEU}|ISd#16_Aj98)Cj6)Pmho>K z`ekT&zh9d1wZA{1?bX|@v=tbXZ~jr|N?}gEiZ_3ePRyHsyvuJ5O8#je zrzsWCV@ktXV*@8IdDnT6-yg;If~@47ZEB;VPiN)<*^8ZAnH219>Vrj#m1nX%yOx_8 zrc!K3=$FgW>?f(CU9C)%9xfJZlIPn!4d6^3`dTSR)n1VO#lr8VDvdZd%J=+5k#sj; zSF?Y&EJKqMc_n0_u7-U+I{B;SjhF?SH`^5O*@|Tl71PJFeg62YL7%*#3MTxZhn)5V z-r|^i+oo*)@UX93v^E!w6uU!O2=8@#<^RF?o)JQ~Itt+T)Qs#OnB7S~Y%h}lW@0yh zoc#x8XT%8Kd?0|OSfVTcfO%R_VpatQa20=R=j0zSR|`taay>g&wm>BpK4*71fdq}J z-qkO5Dso11ex1avd=!;|%tDZ2=a0wn-e+Q?c^{FyybqJW4ZN-XHrmxkY{tVReQs{rW0 z48WuT?h`T~N>CfX;lA@s4uB5O07O{tJ_Q7#6kr4U(JSV+HSbX);NRAK%)o!)yZtPa zNO=fpVBl^&GbK(O#yC;P9&uO;765D~K3ESvH3p{+wlG1M9)5ZaQXgU@GC=p9o&!^Z zSb_-6od@>Bt034zf^z5iK=CR-_JDBQb`nvPG=y$~Ks@N|Be*KaDsTwyI&mpl1411U zd^^u~idTWLhlbs*)1;y`K-7QX0k`w)s(6(E_HzDK)YPV+BeGD_#X?wugKzg)TmT)Y z0l+}qdG=Sl3Y>kSpgi*Mu@oREoACj9)M;pV2$Z!6g7vu5*%Dwt4Mqm;L8rvQRiRcP zf_U!%bucxgC6IvLdoCVK4Pgl&xObhz7p(!Mo*>W^+^!tBKAOvYa%|Ijo z-g6IOYH&-S$$;;)MUWQQ40vMTJ>wCk7GQ~7HW9f}Bk({;kSJ-0KDILr4AOFZ>JWLBXbKRd%?T2S*z--3puo*WrV)W>tzwknwjq*< zfTLcaifDTvDMi33vrvCUls$mN!hNg7hNdWL+!(!cK6K;Xm6Ks0TL&k!l2- zDhyS`*aJ;C{O1`1^$^Cu(~f|Xl%a|N_Q+MxkqfT_4{k;-ybj|EPCb05M1!>8W&l$U z&&kvfC5#P;Qjh54vD3gHEyt%Gk!NJfORIYt?78zS`x zI8q&|h_(lkdIX&24pl_i14upGXUGF|;07c}Jt9u9#{wYi1g9Q0cD~fZs|Pm*k$PBI z`BM+SBE}wQ>S1N&NjhmDmt_3$g&?crM;Med{I*d+LQBw;((17(vL5vmwz4?7A&*{~X8$Pp@9Ba(l5e4=e6Va=|6(vfu8`5N+c z3305!c4#z$yA>0z8fg_g88x!CchIMOCK zke1yH!_J|xo2`cLotk2^)o|XixlRBdOX?G&i8G1*NoWm2gD3^qu#rv$$j!V}8bUV}y&wvXrGX zAQ>uoVi$(60S=@UV@6j#lPiSm2L=VU#EuA6jIn2~N}Aq2OhYG6!vx!LMv7I4;?2+g z66ecCN2kGXAT6*N*qhi02GEHzpfAT_8H%2ilX8FfCD;Tr97rq1jI|e|ksTvm<$xAB7Eyp85b1yMOM>F1k;271L9}`giFO#CS!VD|iV`%k%m9IM9&-IKb<}^7 zze&LlrF|{7d5|##?I01dKB8=K)=&rkn8THRAAgogj=Io>FLATf^hPemS!re&4NQwQPz_OJXUPk>eF(0$%W*kczfwL}Kr>K$j zant=p>iR`|Y?fi%Up&*nS%ok46U|Zm+fDyZsUIfhXGWhW(f7To=B)KJzYKrk#beRG zOiLx_-&IdVRmHGzpu`A!xPDh5YUAI3k90@pZ| zXbs63EOy*YK+I;*z2?;hyS^gxasQ~HFiv^E-USD_&&)V=|AFS=VP$#ar5RuQLjpAOpp7TTYdB`5-U-6$ z9qQxj@S>`iWeuG;lWXnah6^@*jjeFUv$( zCd!nC3b}8I>n7V|_`UXNWxZ0i|IB`whAtbQUS}3PFGI#|rHZdiY#2%@pW{bGb;{5H50cckWM{*&ejW&2-lhojNe{LZFs@;|c9 z;D^?kuqkdluXB4NO%mgi&P~ES(}`4S|70Lr<>&d(4nntF{5F3RGnk#g5$?~>4#Wu> zSa@CGxoM9>G&Bc}Sj*EY~6o*RqsjHYJs{?S0`+j~iaDd9nx zf%0dAvha|iEb9V1qoly@Ej9%pzb>50n?5h|aiZBHPo(0;)8{ByL7UynOgdB6d^LSm zW>;x3D#_Q9-4A~=OA>WTdkuHyssqe^mQ}j?4I8=nBcT!{y{Znhqa+`>ziwYosQt^( zu4fN$guIH~UO(HxayhptA72A;RSl)Mkj0nr)zMmbD>}Lxl_vhA?B)n`MMJeg7N1)q zU&p#}j*VU9=da^CK3P%&P@mKJR^x&*D})Tdu3V<(Yun>w7e@=?DE{keTQz|3T1SJ*Q)R4MWCJO*a| z<6$m)>xGW>VT$PG5cQ8-X6yRDxQzP>%nOZlk1y*E4NH-a;_m@K5-IE2g`r4ERzKGD zoA21W`X7I4Q%EdpxYzvQUG^(`_fHLEPcj}(>Y=2HbMpgz;?nOdLSs)}9-rF7T&1C# zAKvA^N=^27<(gVl8_A?4%YV#yH@p!3+%^T9YV*IH#91S}G-IXwfxd;8hKBf#Ra;{e)E$4sMV6TsTwiu{u{-!a6ZRNJw%4fIU z^+CQ@XX;w)r(s^$bM*Y=-fLtU##5Pp(Lxy#s+u=4tzu<{&{Kmt%Z6by6Z^Hhr|x`s z*d9Ou?U?mso2+9*Wi~uyvhvM$O<$9*=PnV&qCXDlTviX&s^kMZ!cEnf5|1m?Rp7Qc zYfFDOlCSe=uCD^2OUdV2#=ogx|F<^PbIzwHwyhORm--*}EtB6=uw(X|Ydv5^_o(^% zl@l>t8&Scp&U^9YtaHPre&=OR?-9PSnjPaLdHea@j~V?=(=ql@hpov-sEzdbK<4~` zdV22c)}}rNDOwXQt#?XYM(g-c6?J;kPtAWQY0^)Q#&o%Vy}y&j&`&zx>Bj&KhSl1; zSAJ&qolJVivU;Y&(A5nNrT2T)mIdF*Dr>vE51WNX2-9pBh~{>WN=m!D-fsLux^IUDH84@c}9!#Ar2QmH@4dt%_LX^OkOysed*}xR@TGz=CWI&Cw zU?cVMriz{H65A+2p^AZ4vAyp2f0*Y92l@T455M`ICnv!`mSeZDe;AGo;_kyIAH#!K zD8Jzg{X_bBnAFcGz6#_W`}}{AfA;css~znLzVe!&>k5WmCRNVS;jz7Gk`L$y84{AO zq>{fzU4L%mZuE9qC|ELz?kkzu55^byb1EtJ<#@GRk5M<(>1lr&VMb^JXatbi z*TskX2rB0BbY23)U^_HI7i8o(?HkghG|&tS)B_oYyTLS+FtHex)5=yc3{ypLJdZe` zf)QNV{E>pZ3@D;HNRU(v#kFjUvhPXRVqL2VDnd4b^g}a|hxWB7+s9#|kEL!afl@yM z{vrXRHBy*y5t9l@!FGS2?&oGnZ{&3hG?fBCulcl&?6qq_qhtCU=!$9@=6N{B`oUI3wvq-0 z)eGIa(8_f7O1xMFlnv}xh?Yi317E(M{u)~Fu>dHWG54MYElz(N$NtOk?Jk!I9Hosi z^&9#Yj6+H5MZS>c@oAy?W%V>PRlilKsxsQGfREQ9St$YEArn=#HB>*I^?;f!<6{G-}c&ghANF>i4#wP$^ zsJ(N<^3=h?#Ns)h-e(|bpYehvb=ojf&p6>s@*lUdtjuKDv6<~#ZQc~)(8yWZ&=u=U zro0@Rb2siaycd7Gd!N127u2b$H}tP-8J`Ab5cYq;>*ocFbIHZ@%7TtdI%a2AdjZ;h z5f>xd>CO%swxgWYQ=9D#imgD66O|op>RR?HSIhipsz_T^9b3hl?_P$gyr(XBdOZ(D zrH&8ByKU}I_Sar>L^v<|IG}I=Ba*gUj-BgCmA5iMU|$Yr*yEKK@$Eb-c=xyt>;_W$3OzKLx`Rb6GH(d?b5)qANn6x#PH6`v2D z%!0d=r~FF4Vcr^iAay{b9Y$7=k^ZHbhMs?a_FMu*Emzgvnc?2(TWW26W!p^kxYwIe z3A{O^c_Sio8@Mgsw5?zycF3MXAzR42wdm4poiAuVY|0=@w-xC0;~TM2gUT&xUyp4+ z0~NrtEyIvtK;0K$SGFyxw`?MytQmf5R4{H1;MHO_dY0=ei@A-<^^HqFRa<|f z*D4#&CD2wzzTFpcsWENNt`!$uSqSX7jCKjpjZBUu9`soT*FO zrt!a3HS{d$+C=dDdsWh#ti83u>{E{OYbfCkG2WU34CO+2u8`ckK)X=aoK5WwUs^a5 zwjui=;o@aN{-dbBV_Scy*_niQq<12F;-SiW(=c7jmc_g^Xl_qVwK-q9a=2hpD=4n~ Ie=gJ)c7gQ*G5`Po delta 6171 zcmV+$806=;J*hpg#s+^yr{@X;pCeIi{Q?A8p4hLB#eF-_FEzc;G>?K)TJ~3RGq8i@ z5A(S0;iI(RcN#>QMyiYWL4PV)e5{ZGz)cgW*qbua7Zn@2@epfW7KC}6g7x^8kUk4(kP_|*wYTfGjns# zq$uGwU4JH_Zl`&8Eq3i#BH ziFRasC-U(LYnp$nf?PDcaBW%1hi@@d)k>c`)Z4_<7e63~`-{DvXTC2Ug#R)N>x5+x zkM5#>nU+e+WnSukw&gudO7ics>sm`Hoc$!f=^8iN z7l-Oq#)83~^LYr6E_nO--H!?+?{jXl!4|cWn{&&r5HEi?sowDKIsfj4(Ij=qfxVhv zOjNcS&f^nR`M1aeKe5Y~)kDQ&Y@Vp;I5!JzOI+)mU+Y{Gb~NMD3}G=f_jG8{jMr$< zUMqTcFQMR%Yg3am#?AfOv^D+WvqFQmY5gRNeavsFc4d>Da(=h_8q_4ht#Ci0HFkaigPOjTvpssWJt*0K%j zdR6o{_SxF)^a(IbybGIoUQf5|3Hc=d=T?xt=LM!V*1l&N3k@>-4QRrDifcVc}8>(Q!4|>RHPv9+% z$+u0$_74yHxmY~`b<3}hC96g&Gnj`u!~8qNENaM!!tWr@i%1GPfegO&Xl88`6oIS^d`04CAB4Yt?cB*SQ)7hw^Vw8WA z*v7XfNNyWP#VDg~W4k0IHH!fP^>D_R*6F6R%!Hl-1ZneiAzngLub>EBoMGzr73nNf zQ8CI`+xA^FQku4dB6N|4DO-6Yv`j(8D2Ld_GWn>N1_4{ABVHQ#`N1Px$Nucm8Zhe6 zXwSYEe5?YLw{2&U6qeYq@el}W4f}tcB(l|jj0Hg0Y1)A$k)bANGy=?2Y+B+!B><`j zvj>xYxDO|QD4}csr5Wz?5Fkom8zAX~_4I{Ec^K&wX+zkV4;W>jZSeHKdD;X(2W9{! z4RD`t0a1e501o$^Cu#t6fCeDKdiNO~5TyVc*pFT@zpZ%>3IYGN=3@p9-|c@VkwnTv zNCN|R>xn0E;xNXELiUKGRW!$Y8~O%SZdovoGt18Oiba1T1;4Xz5c3K7J6 zkC%g~AuWLf^xjkLU}^|U0KvWM9KC1_DD?z^e((u?ICZdv;Go}kejtBH3u*=;0q~w$ z2vdVw0!;>dXC;EPz-GV`1MdlqFtq?nmL-e7WX<(3+<5P#olR#5|AZ<>NK*XL-ngj)IHZqL}JSi2U47UxDOavU% z3ROhg14$_Y&WMF7qU?VGBo^+IX#qNL0}`Yc5$ED!0T6bAlMMgKyg)s;F^E(n;7nks zBE}wQ!r?!S7^sIZ2A*~VoP!Ki46sM8f{t8x9eC_Aa^ZCtS8(d#JL4Io1vdkjdU(#2 zhA3fdNR)a+A8wrn25C7y^@u!iI|T^R<^-un>?z<$P~c`GQ;&bZ6Ui~kaN7{6N5FyT zP(`#okkljKEOw|O${s-K;XVN#paVA`LFy54{yY`{VJA5Cu(9){9$r1TF^JT|cBH*G z_3$fV?182pR#u+W!>fld2A+D@Sb0+qzoOk9zSU9WK1z;Vl0f??IjF>@6UI}=%{hcq zG?L~0b~G}E<`sV=LKaGt1>8&1ufsYg;jH4p)WDW(2a+T)V*>-|L>eHDx{z{mhjN4o zCOD8*tQqpCji|?~nTw7Ts}Ri_mmFId1!cr>x)8&_xv!m?FX4^RgqQ?RZSBmw`D~PG ztW`9`UG_r~wsSpDHklEjijnrPqcD^Wt1*Tgp`tY+smFgO+C~!A?Aj+CNtd0kAy1bO z#~N%$J|nnWG2yC_R>6}|BP(wP8mbj*Mr^KW=3z-NJ;IS(XL{H4FI`*MHt&ujZGr=7 z+08KQ92&dXYWUu%DK=XT=N+5t1n{w>J~5g&ljxs>)-W`PQh*Kn?z5NJJ#g(v$1QEM zSOpYs@Nj>!^YIO*08N}pfZ^oeWFJm$CA$sE>83O}>8L)7ONKG#XIwW%xM(a(SxN(v zp^_(dVF(-GKw2?obmcR-Ldbq#P+&{!h)~5Cd*-U7>FvWbbn-MzupMWlScNFw{Om7r zI&5@w8Vm>00-J%oiH%?YohSqPax9ji=t(&#hhKk!O)$fOv|`Lydodc>F>(hquq`f3 zxT@VM>)F9!23XaDq~d7t%4k-ac8E|Ar^AuS(X0;oVPZkdY{!~L zQ?vm^%7>9Y9`_v2_J|NF=;L=i<^6%5Tpr~?od?ASO9oLS@cYE1A13ByrktN0#YoX} zSNne%xDMft~$og2GYND-l3NKQ2C!jIJCY z6F}&A`g|m%qk5230IlPJ_K~!X>OoQgwDf9IbO_yHDQRKT(GeI=Qd16)@epcy z9@51-@ApE{`UKJHJtW#;cxIWwGfnS7Bg+gBDCZ&94^u}i`I{8{4A|FVKZ!nqpdEiC zLe@u=t%Vxu;NMQT((mKXQpr&l+VCZAQkdSz#W)GfEaM?WQ_UHkn;Eo2gnXPPLUjoI z31lznj@fgsHo$y?hw|I0{_jRH8K`Cydx}HvutuME9Ck z8|?au%*XwshQc`I0b2ze}3CC-%CFBjB!3OV!~ zklsN~7Pm)*U++F=dtpY0B5RZup4-~LI~&;h&()Jn{k;=})jQP3*X4=QZ@RH1><8C! z@v*uyYKYQIFFiNMJSgii)?G9ep^4G#Z0w^!UY7CRm1XCK=&)sa^csKWEDv)w@=3_j zE0y0eS@As+m$N2kaQd0bi{%QrnAq=Z;4_#1DC9o~d0v)@vP_gI4Ha^m5Z6t%$?$va z)5?0KZ2y`4G7VieKE2K?dR~T%-A*+(M%-Vstr_r5sS-(iZ|VdiZc&XBA9e-2m&4%5ZT9e_o=%K0Yw#eBOwOrm7xftj4W@H@X zZ4pFchx^3{$M|h2Rjp z2B1Et^R31OXI2OqfL*yv&DXr6;b$|pO;!2^jalIXFU@~GXr+gy#-ZsKb_0$>`}*i^ z>%$b$%OUC? zxy;t}e{mW26_^(q=^kI!9U7J*AI0ATfFx4ZwF^U$lB|BL>o?!AclAHirjS_HaIg8p zyX;r??w@}e$ev_8oYX@}73by$`oyK*S%k)(ygWX&hq+2aH$S|~f0df-@ya!|s5X*G zOP2qb^KN({{JCukHr3{TJBhPKcxlE;`2&3mFAWXx4WhQdwDPy4$=~ix@!~C%y*;(X z`cO$7?2UXn2(s@Sb6Y*F!CoKxZ0$|g{7qvB+d_Y7wUy6qz3YQ~ug=t^&rid=u;=Lc z$-UReG>oS*|DuI5BvdtTWLm|_456n6b(RgoW+wJ)cTe5<@UT6A0@^X_$u?QXh{|ku z$YkZ4@0z|QU(a14iba1M(z&c2s#VDcc7&U%F(n>XsH?zjbJmt_Bwy##Tweu3my*x5 zjDLSq!TxV;s^^?fPi$K&m@f4{>{}+ksbI(KIoEo?itbVK_bVr2x;CPMU!C{j%US1! zP5sWxp57yTA2U0~N%HpdyB{<9ou*^#qYhh>kx(1y^MTCy1NHRW*{w}|3{tcvTw3px zx{TKGp(^V1rk|Qo(xjgpjp=g#dVeR4p`U+rz|)Tb8Vswocdz`+?mLshoP$* z97^x^sx1q?lU3Gsc^@_ljS!~UFc8h{9+i}KdA~PHCad&z#fvi8ui}fNL6fxD!&_bp z@#WA2B{1cDG;W*lxtcIY|G~<{`29(i=}m}Ucc4lEaS7ghCt|Qd557!Mj9HN1qB4I* z>kh%01DakhwsjDV0ly5>b3PJwzp%+Hp>5v0w*hPw8LP->MJ6iTd;$^$2bmA9tRbdb z`m&)B6Tjr9ki#-_;?EK`}D#WmsGtSuk{Kx(O(pZ0=8Zm3yX9uey@Y)aCtTa_ke&+6D*r6Rr7KIRs z^1=o+bC#8tPz4|EXEHGtdhWk#74N}6wyy`JbGPY=yxyv;O|7_A?fWWjMa~v1AU|y& zTkBbBYqUr8l@jk^Ba$loe6`<)Gk zA;>^;mr{SISK6x~-cvmcBv^5;}i?91_LxgMi#s?*an!i>-c&aAz7G>X)vcN&lw8ahrLWi7_3SkU6c zBiOTGXlW`1fL`-y9ocKwf=0*mInWi=G|cmGj`f4Bifknf45}Bpb)l8%?3H-23Md=c zuMjPbkOsbdKm9ed;9~($He>ER3tF5wj{TS6+g&abI7%C3>NkJ%Ef|NA){A^0&EwNT z^ULaKXsUz*^9T&qj4^K_q<$@$cE;Uh2be&)c%oezqFo@-E}m%D^n6Mkilz<{Bo#vu z*@@oX19aOk)r6hVmFPO#;3DNC3>arzioEOl1R^+1WnT|nG4rW^D0T*0i(@k#(lUf(@pX(~toGVxTm7m!GvJB&{Nz)*YVh~=q+g^9&;KE2OC z(mvw_OU9Ew1=~pSXS`rZ0Qo)K# z2kNwQXpLlO01Ow6=TGBqBw>c&sEenhh1f`PI#!TWJYhqChNNvmjF2^$pcdq|tFn{zKp><|aJ10dV-oLwygk871!ZzsRj*Nq%B(Hs@~KYj`jIc=tYg zr!S~eRd48D*D^j0%pmN8*N^8F$AF9Jl?5G_bj*LwuJ!`7{UR<#w!@SiG;Aj)tA`}p z8x&iC8Ye0{+SIk|Rj!u#(NvMPsyeocH{ZPsRe4Wc@br2fj7l9Jj(6MKpX{%_=7?}! z_HjVr0!Aclxg0x2lPYgzg228U&alTTFXG#e$~*CGcFU!B(Ho^&#XoB4Is=<{u;>c1 zENFjOF%zDed$HNl0c0ze&F&@izU0eB&&pR*kLt`;BY?Jt%C+%*n~|;Ued83lt4UAh zgVlTSrgN41i|zlvEqxQ)imJNGMx)s~QLFb-Z78(wRVqFoJedV|DNp&8e#5*q_(1A_ zNIQ(IAS3-tGYvie?70MrTCS?SGsC^nx72^y`pUMM>T$0(qY`*?Nb^QS<~DF!zG+** zNbHb3heEcHd27+7**ag)e%O>jmToK1>Bl!>qXv~*)V?0ueg-OlXIq9VF)1IuXMwse zz^-gtRBzctKv^^V)~I0I9Kfr^Z1gPGR~B;{m+KpsfU34euT?glOQ5Zce7i5?RAqm+ z_5zlT(!|Ix(#F?(tB~3}f0AP^XwBwZ!y3(f2ENMDx;RspwoT)It7_<3(zS`;`S+@% zH(7gYh1sVZ=hsle9b&vS2N=qQ@LVCed4YDJt~s098@{w~CTv6YL&C+&g#1TQf5*1| tP_r`$??~_V#6y+$reV64EsJ?;(A*QAoN9Bvbmeftq*hQ|`TuX8$0n>43ts>L diff --git a/latest/searchindex.js b/latest/searchindex.js index bf6044f73..0766bfe84 100644 --- a/latest/searchindex.js +++ b/latest/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["backends", "cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["backends.rst", "cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Backends", "Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"kernel": [0, 1, 3, 4, 5, 6, 12, 14, 16, 17, 18, 19, 20, 22, 23], "tuner": [0, 1, 3, 4, 5, 6, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "implement": [0, 5, 6, 10, 11, 16, 17, 18, 22], "multipl": [0, 2, 6, 12, 17, 21, 22], "one": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "opencl": [0, 3, 4, 7, 8, 9, 10, 12, 13, 15, 22], "hip": [0, 3, 13, 22], "gener": [0, 3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 20, 22, 23], "select": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "i": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "most": [0, 3, 6, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, 20, 22], "case": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 19, 20, 22], "automat": [0, 3, 4, 7, 8, 9, 11, 12, 15, 21, 22], "done": [0, 4, 14, 16, 17], "base": [0, 3, 6, 16, 17, 21, 22], "": [0, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22], "program": [0, 3, 5, 7, 8, 9, 12, 15, 20, 21], "languag": [0, 6, 9, 12, 15, 20, 22], "sometim": [0, 3, 7, 8, 9, 20], "you": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23], "ll": [0, 4, 7, 8, 9, 14, 15], "want": [0, 5, 9, 11, 12, 14, 15, 17, 19, 22, 23], "specif": [0, 4, 6, 7, 8, 9, 10, 11, 16, 17, 18, 22], "choos": [0, 7, 8, 9, 15, 18, 22], "pycuda": [0, 3, 7, 9, 11, 12, 17, 21], "default": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 21, 22], "It": [0, 3, 4, 6, 7, 8, 9, 12, 14, 15, 17, 21, 22], "compar": [0, 4, 5, 7, 8, 9, 11, 15, 16, 17], "complet": [0, 1, 4], "cupi": [0, 3, 12, 14, 17, 21, 22], "becaus": [0, 4, 5, 7, 8, 9, 12, 14, 15, 16, 21, 23], "ident": 0, "includ": [0, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "here": [0, 4, 10, 11, 12, 14, 15, 17, 22], "well": [0, 7, 8, 9, 11, 15, 17, 22], "To": [0, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22], "us": [0, 1, 2, 3, 4, 5, 6, 10, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23], "nvidia": [0, 3, 6, 14, 15, 17, 21], "gpu": [0, 3, 4, 5, 6, 10, 12, 13, 15, 17, 19, 20, 22, 23], "see": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 19, 21, 22], "http": [0, 3, 6, 13, 14, 17], "github": [0, 3, 4, 7, 8, 9, 11, 14, 15], "com": [0, 3, 6, 13, 14], "jatinx": [0, 14], "nv": 0, "while": [0, 1, 4, 6, 7, 8, 9, 10, 15, 17, 18], "expect": [0, 3, 4, 5, 6, 7, 8, 9, 15, 17, 22], "all": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 22], "input": [0, 4, 5, 7, 8, 9, 10, 12, 15, 16, 19, 20, 22], "output": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 19, 22, 23], "numpi": [0, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 19, 20, 21, 22], "arrai": [0, 4, 5, 6, 7, 8, 9, 11, 12, 19, 20, 22], "also": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "argument": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "thi": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "give": [0, 7, 8, 9, 18], "user": [0, 3, 4, 5, 6, 8, 10, 14, 15, 16, 17, 18, 21, 22], "more": [0, 3, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 19, 21, 22], "control": [0, 7, 8, 9, 17, 18, 22], "over": [0, 6, 7, 8, 9, 14, 15, 17, 18], "how": [0, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 19, 20, 21, 22], "memori": [0, 4, 6, 10, 12, 17, 20, 22, 23], "handl": [0, 12, 22], "check": [0, 3, 5, 6, 7, 8, 9, 12, 15], "dure": [0, 1, 6, 7, 8, 9, 11, 17, 22], "verif": [0, 2, 10, 22], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "happen": [0, 1, 3, 4, 15, 19], "entir": [0, 3, 6, 7, 8, 9, 15, 18, 22], "when": [0, 1, 3, 4, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23], "onli": [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 18, 20, 22], "textur": [0, 6, 22], "c": [0, 3, 4, 6, 10, 12, 13, 14, 15, 19, 21, 22], "signatur": [0, 4, 6], "With": [0, 11, 12], "other": [0, 1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 22, 23], "requir": [0, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21], "ha": [0, 3, 4, 6, 7, 8, 9, 12, 15, 17, 18, 22], "extern": [0, 17, 21], "linkag": [0, 21], "If": [0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 20, 22], "code": [0, 2, 4, 6, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "wrap": [0, 6, 19, 21, 22], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "block": [0, 4, 6, 7, 8, 9, 10, 11, 14, 15, 16, 19, 22, 23], "which": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "mai": [0, 3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "caus": [0, 7, 8, 9], "issu": [0, 20], "contain": [0, 1, 4, 6, 7, 8, 9, 11, 12, 15, 17, 18, 21, 22], "cannot": [0, 3, 7, 8, 9, 17], "have": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "present": [0, 3, 15], "header": [0, 22], "file": [0, 2, 3, 4, 6, 7, 8, 10, 12, 15, 18, 19, 21, 22], "As": [0, 1, 4, 7, 8, 9, 11, 14, 15, 17], "detail": [0, 6, 14, 22], "further": [0, 7, 8, 9, 14, 15], "templat": [0, 2, 11], "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "fulli": [0, 3, 14], "limit": [0, 3, 4, 6, 7, 8, 9, 10, 15, 17, 18, 21, 22, 23], "python": [0, 3, 4, 6, 10, 11, 12, 15, 17, 19, 20, 21, 22], "benchmark": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 22, 23], "observ": [0, 2, 6, 16, 22, 23], "constant": [0, 4, 6, 7, 8, 9, 10, 12, 15, 18, 22], "dynam": [0, 6, 22], "share": [0, 4, 6, 22], "anoth": [0, 7, 8, 9, 12, 15, 16, 18, 22], "import": [0, 4, 5, 7, 8, 9, 11, 14, 15, 16, 19, 20, 21], "differ": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 22], "between": [0, 7, 8, 9, 12, 14, 15, 16, 18, 22], "The": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22], "tabl": 0, "below": [0, 3, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20], "list": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22], "packag": [0, 3], "pyhip": [0, 6], "interfac": [0, 4, 5, 12, 14, 17, 18, 20, 22], "lang": [0, 6, 10, 12, 21, 22], "nvcuda": 0, "nvcc": [0, 6], "nvrtc": [0, 6, 21], "hiprtc": 0, "A": [1, 3, 4, 6, 13, 14, 15, 17, 18, 22], "veri": [1, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21], "featur": [1, 4, 5, 10, 14, 16, 17, 19, 21, 22], "abil": 1, "store": [1, 3, 4, 6, 9, 15, 17, 19, 22], "result": [1, 3, 4, 5, 6, 9, 11, 15, 16, 17, 18, 19, 22, 23], "tune": [1, 2, 5, 6, 10, 13, 14, 18, 19, 21, 22, 23], "enabl": [1, 17, 18, 20, 21], "pass": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 21, 22], "ani": [1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 20, 21, 22, 23], "filenam": [1, 4, 6, 10, 15, 19, 22], "option": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 21, 22, 23], "tune_kernel": [1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22], "individu": [1, 17, 18], "configur": [1, 4, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 22], "append": [1, 6, 14, 22], "run": [1, 4, 5, 6, 7, 8, 11, 12, 14, 15, 17, 18, 22], "allow": [1, 3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 18, 21, 22], "restart": [1, 3, 7, 8, 9, 18], "session": [1, 3, 6, 18], "from": [1, 3, 4, 5, 6, 7, 10, 11, 12, 14, 15, 17, 18, 20, 21, 22], "exist": [1, 6, 22], "should": [1, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 19, 22], "someth": [1, 4, 7, 8, 9, 15], "termin": [1, 14], "previou": [1, 3, 7, 8, 9, 18, 22], "befor": [1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 22], "had": [1, 4], "quit": [1, 7, 8, 9, 11, 15, 21], "often": [1, 7, 8, 9, 17], "hpc": 1, "environ": [1, 4, 6, 14, 18, 22], "job": 1, "reserv": [1, 8, 23], "out": [1, 3, 4, 5, 11, 14, 15], "number": [1, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 20, 22, 23], "simul": [1, 6, 9, 13, 18, 20, 22], "visual": [1, 3, 15], "optim": [1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 15, 16, 17, 22], "strategi": [1, 2, 4, 13, 16, 22], "start": [1, 2, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 22], "call": [1, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 21, 22], "full": [1, 3, 6, 17, 19], "search": [1, 4, 6, 10, 13, 15, 16, 18, 22], "space": [1, 3, 4, 5, 6, 11, 12, 15, 16, 18, 22], "true": [1, 4, 5, 6, 7, 8, 9, 12, 15, 17, 18, 22], "creat": [1, 3, 4, 6, 7, 8, 9, 10, 11, 15, 17, 19, 20, 22], "even": [1, 3, 7, 8, 9, 12, 15, 18], "work": [1, 3, 4, 6, 7, 8, 9, 14, 16, 18, 21, 22], "still": [1, 3, 5, 15], "new": [1, 3, 6, 7, 8, 9, 18, 22], "come": [1, 6, 7, 8, 9, 15, 17, 21], "thei": [1, 3, 6, 7, 8, 9, 10, 15, 16], "stream": [1, 6, 7, 8, 9], "pleas": [1, 3, 4, 10, 13, 14, 17, 19, 20, 22], "dashboard": [1, 13], "introduct": 2, "instal": [2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19], "get": [2, 4, 6, 7, 8, 9, 11, 14, 15], "convolut": [2, 5, 12, 15], "diffus": 2, "matrix": 2, "exampl": [2, 3, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "backend": [2, 3, 12, 17], "cach": [2, 3, 6, 7, 8, 9, 14, 15, 18, 22], "correct": [2, 3, 12, 20, 22], "host": [2, 3, 6, 8, 9, 10, 17, 20, 21, 22], "struct": 2, "metric": [2, 4, 6, 10, 15, 22], "object": [2, 4, 5, 6, 7, 8, 9, 18, 22], "api": [2, 4, 6], "paramet": [2, 5, 6, 7, 8, 10, 12, 15, 16, 18, 19, 20, 21, 22], "vocabulari": [2, 17, 19], "design": [2, 3, 7, 8, 9, 17], "contribut": 2, "thank": 3, "consid": [3, 11, 13, 15, 22], "Not": [3, 6], "help": [3, 21], "u": [3, 4, 7, 8, 9], "improv": [3, 6, 7, 8, 9, 15, 18, 22], "about": [3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 19, 22], "problem": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 22], "ensur": [3, 5, 7, 8, 9, 12, 14, 17, 20], "follow": [3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 21, 22], "describ": [3, 4, 6, 12, 17, 20], "what": [3, 4, 5, 6, 7, 8, 9, 12, 15, 17, 19, 20, 21, 22, 23], "possibl": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "minim": [3, 16, 21, 22], "reproduc": 3, "actual": [3, 4, 5, 6, 7, 8, 9, 11, 15, 21], "error": [3, 4, 5, 6, 12, 15, 21], "print": [3, 4, 6, 7, 8, 9, 11, 15, 22], "version": [3, 4, 15, 17, 22], "cuda": [3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 17, 19, 20, 21, 22], "compil": [3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "applic": [3, 4, 7, 8, 9, 10, 11, 12, 13, 16, 17, 20, 21, 22], "For": [3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 17, 19, 20, 22], "propos": 3, "chang": [3, 11, 17, 22], "addit": [3, 4, 7, 8, 9, 14, 16, 19], "signific": 3, "first": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22], "discuss": [3, 6], "Then": [3, 7, 8, 9, 11, 13, 14, 21], "fork": 3, "repositori": [3, 4, 7, 8, 9, 11, 13, 14, 15], "branch": 3, "per": [3, 4, 6, 7, 8, 9, 11, 16, 17, 22], "pull": 3, "request": [3, 17, 22], "googl": 3, "style": 3, "sphinxdoc": 3, "docstr": [3, 6], "modul": [3, 6, 12, 17], "public": [3, 13], "function": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "up": [3, 4, 6, 7, 8, 9, 14, 15, 19, 22], "date": 3, "written": [3, 21], "unit": [3, 6], "your": [3, 4, 7, 8, 9, 11, 12, 13, 14, 17, 20, 22], "nox": 3, "do": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 22], "hardwar": [3, 7, 8, 9, 11, 17, 18, 19], "skip": [3, 4, 7, 8, 9, 22], "produc": [3, 5], "same": [3, 4, 5, 6, 7, 8, 9, 11, 12, 17, 19, 22], "better": [3, 7, 8, 9], "entri": [3, 6, 7, 8], "changelog": 3, "md": 3, "doubt": 3, "where": [3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 20, 21, 22], "put": [3, 6, 7, 8, 9], "look": [3, 4, 6, 7, 8, 9, 11, 14, 15, 21], "regard": [3, 6, 18], "step": [3, 7, 8, 9, 14, 15, 16, 18, 21], "set": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 19, 21, 22, 23], "sudo": [3, 14], "access": [3, 4, 7, 8, 9, 11, 17, 20], "e": [3, 14, 16, 17, 18, 22], "g": [3, 14, 16, 17], "devic": [3, 4, 5, 7, 8, 9, 10, 12, 17, 21, 22], "clone": [3, 4, 7, 8, 9, 11, 14, 15], "git": [3, 17], "desir": 3, "locat": [3, 5, 11, 17], "kerneltun": [3, 13], "kernel_tun": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 19, 20, 21, 22, 23], "cd": [3, 14], "prepar": [3, 6, 7, 8, 9], "system": [3, 13, 14, 17], "On": [3, 7, 8, 9, 22], "ubuntu": 3, "apt": 3, "updat": [3, 6], "upgrad": 3, "y": [3, 4, 6, 7, 8, 9, 11, 12, 15, 22], "make": [3, 4, 7, 8, 9, 11, 13, 14, 15, 17, 20, 21], "essenti": [3, 4], "libssl": 3, "dev": [3, 14, 17], "zlib1g": 3, "libbz2": 3, "libreadlin": 3, "libsqlite3": 3, "wget": [3, 14], "curl": [3, 14], "llvm": 3, "libncurses5": 3, "libncursesw5": 3, "xz": 3, "util": [3, 15], "tk": 3, "libffi": 3, "liblzma": 3, "openssl": 3, "pyenv": 3, "linux": [3, 14], "bash": [3, 14], "rememb": [3, 4, 7, 8, 9, 15], "add": [3, 4, 6, 7, 8, 9, 12, 15, 17, 18], "bash_profil": 3, "bashrc": 3, "specifi": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "maco": 3, "brew": 3, "after": [3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 22], "shell": 3, "some": [3, 4, 6, 7, 8, 9, 14, 15, 16, 17, 18, 19, 20, 21, 22], "need": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22], "libgdbm": 3, "libnss3": 3, "lzma": 3, "3": [3, 5, 7, 8, 9, 11, 12, 14, 15, 18, 22], "8": [3, 4, 6, 7, 8, 9, 11, 14, 15, 17], "9": [3, 4, 5, 7, 8, 9, 12], "10": [3, 7, 8, 9, 13, 18], "11": [3, 7, 8, 9], "reason": [3, 4, 6, 20, 22], "we": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 20, 21], "re": [3, 4, 7, 8, 9, 11, 15], "oppos": 3, "just": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15], "so": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 19, 21, 22], "against": [3, 5, 6], "support": [3, 4, 6, 7, 8, 9, 12, 14, 17, 18, 21, 22, 23], "found": [3, 4, 6, 13, 17, 18], "replac": [3, 4, 5, 6, 7, 8, 9, 11, 15, 22], "global": [3, 6, 7, 8, 9, 18], "virtualenv": 3, "virtual": [3, 14], "folder": 3, "whatev": [3, 6, 12, 18], "name": [3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 19, 22, 23], "prefer": [3, 4, 6, 7, 9, 17, 22], "poetri": [3, 14], "ssl": [3, 14], "org": [3, 13, 14], "python3": [3, 14], "sure": [3, 4, 7, 8, 9, 13, 14, 15], "path": [3, 4, 17], "instruct": [3, 7, 8, 9, 10, 14, 15], "end": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 20], "export": 3, "plugin": 3, "self": [3, 6, 17, 18], "non": [3, 5], "depend": [3, 4, 5, 9, 10, 11, 13, 16, 22], "appli": [3, 7, 8, 9], "open": [3, 5, 7, 8, 12, 15], "take": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 21, 22], "effect": [3, 4, 7, 8, 9, 22], "activ": 3, "pip": [3, 4, 7, 8, 13, 14, 15], "point": [3, 4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 22], "project": [3, 14], "extra": [3, 14, 21], "doc": [3, 4, 6, 7, 8, 9, 11, 14, 15], "leav": 3, "doe": [3, 5, 6, 7, 8, 9, 11, 12, 15, 17, 21, 22], "go": [3, 4, 7, 8, 9, 11, 13, 14, 15, 19], "necessari": [3, 5, 6, 7, 8, 9, 22], "conveni": [3, 7, 8, 9, 12, 22], "cuda11x": 3, "cuda12x": 3, "These": [3, 7, 8, 9, 11, 14, 15, 17, 21, 22], "current": [3, 4, 5, 6, 7, 8, 9, 14, 15, 17, 18, 22], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 21, 22], "part": [3, 7, 8, 9, 13, 14, 15, 16, 20, 22], "forget": [3, 11], "correctli": [3, 15], "ld_libary_path": 3, "cpath": 3, "pytest": 3, "except": [3, 6, 10], "been": [3, 4, 6, 7, 8, 9, 12, 15, 18], "left": [3, 6, 7, 8, 9, 11, 16], "gracefulli": 3, "note": [3, 4, 6, 7, 8, 9, 11, 13, 14, 15, 17, 20, 22], "driver": [3, 6, 7, 9, 11], "privileg": [3, 17], "read": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 22], "counter": [3, 17], "energi": [3, 13, 17, 18, 23], "measur": [3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 22, 23], "cat": 3, "proc": 3, "param": [3, 4, 5, 6, 17, 18, 22], "grep": 3, "rmprofilingadminonli": 3, "1": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 22], "without": [3, 7, 8, 9, 11, 12, 17, 18], "conda": 3, "mamba": 3, "perform": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22], "miniconda": [3, 14], "tradit": 3, "under": [3, 4, 13, 22], "quota": 3, "otherwis": [3, 6, 15, 22], "restrict": [3, 6, 10, 15, 21, 22], "disk": 3, "directori": [3, 4, 7, 8, 9, 11, 14, 15], "save": [3, 7, 8], "ad": [3, 7, 8, 9, 12, 22], "condarc": 3, "envs_dir": 3, "both": [3, 7, 8, 9, 10, 15], "via": [3, 18], "usual": [3, 17], "provid": [3, 5, 6, 7, 8, 9, 12, 21, 22], "exit": 3, "enter": [3, 4, 7, 8, 9, 11, 15], "avail": [3, 4, 7, 8, 9, 10, 11, 14, 17], "elsewher": 3, "variabl": [3, 6, 11, 14, 18, 22], "pip_cache_dir": 3, "dir": [3, 14], "xdg_cache_hom": 3, "continu": [3, 4, 6, 7, 8, 9, 14, 17, 18, 22], "n": [3, 5, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21], "forg": 3, "execut": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 22], "config": [3, 6], "auto_activate_bas": 3, "fals": [3, 6, 17, 18, 22], "load": [3, 6], "unload": [3, 6], "rocm": [3, 14, 17], "inform": [3, 4, 6, 7, 8, 9, 13, 17, 18, 19, 22, 23], "like": [3, 4, 6, 7, 8, 9, 10, 11, 15, 18, 19, 20, 21, 22], "keyr": 3, "seemingli": 3, "weird": 3, "known": [3, 15], "m": [3, 7, 8, 9, 11], "disabl": 3, "verifi": [3, 5, 6, 10, 22], "miss": [3, 6, 22], "sync": [3, 20], "dry": 3, "node": [3, 18], "In": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 22, 23], "noxset": 3, "toml": 3, "venvbackend": 3, "2": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 22], "anaconda": 3, "venv": 3, "alreadi": [3, 4, 6, 7, 8, 9, 14, 15, 22], "Be": [3, 7, 8, 9], "adjust": [3, 4], "envdir": 3, "particularli": [3, 4, 16], "diskquota": 3, "isol": [3, 21], "top": [3, 6, 11, 17, 22], "level": [3, 6, 17], "coverag": 3, "gigabyt": 3, "size": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 18, 19, 21, 22], "tight": 3, "diskspac": 3, "small": [3, 4, 7, 8, 9, 15], "remov": [3, 18], "each": [3, 4, 5, 6, 7, 8, 11, 15, 17, 18, 22], "ran": 3, "longer": [3, 4, 6, 16], "would": [3, 4, 7, 8, 9, 21], "command": [3, 14], "line": [3, 4, 7, 8, 9], "combin": [3, 4, 6, 7, 8, 9, 10, 11, 15, 17, 18, 19, 22], "compat": [3, 6, 14], "involv": 3, "especi": 3, "don": [3, 6, 7, 9, 11, 12, 22], "t": [3, 4, 6, 7, 8, 9, 11, 12, 14, 18, 21, 22], "break": [3, 21], "them": [3, 4, 9, 11, 12, 15], "capabl": [3, 6, 7, 8, 13, 15, 22], "hold": [3, 7, 8, 15, 19, 20, 22], "pyopencl": [3, 6, 8, 17], "invok": 3, "tab": 3, "studio": 3, "id": [3, 6, 17], "seen": [3, 4, 6, 15], "integr": [3, 21], "type": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22], "html": [3, 6], "page": [3, 4, 7, 8, 9, 10, 11, 13, 15, 16], "sourc": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "inspect": [3, 6, 17], "commit": 3, "brows": 3, "through": [3, 6, 7, 8, 9, 11, 13, 16, 17, 18, 22], "least": [3, 6], "those": [3, 4, 10, 14, 17], "pandoc": 3, "mac": 3, "onlin": 3, "built": [3, 17, 18, 20, 22], "action": 3, "correspond": [3, 4, 7, 8, 9, 11, 17, 18, 19], "master": 3, "latest": [3, 14], "last": [3, 6, 20], "releas": [3, 6], "stabl": 3, "publish": [3, 13], "process": [3, 4, 6, 7, 8, 9, 15, 16, 17, 18, 21], "again": [3, 4, 7, 8, 9, 11, 15], "autom": 3, "guid": [4, 7, 15, 16, 19], "meant": 4, "write": [4, 10, 11, 15, 21, 22], "script": [4, 6, 15, 20, 21], "simpl": [4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20], "find": [4, 12, 15, 18, 22], "shortli": 4, "much": [4, 7, 8, 9, 11, 17, 21, 22], "reus": [4, 7, 8, 9, 15], "document": [4, 5, 7, 8, 9, 11, 14, 15, 20, 23], "jupyt": [4, 7, 8, 9, 11, 14, 15], "notebook": [4, 7, 8, 9, 11, 14, 15], "tutori": [4, 7, 11, 13, 14, 15], "readi": [4, 6, 7, 8, 9, 11, 15], "oper": [4, 7, 8, 9, 11, 12, 15, 16], "signal": [4, 23], "imag": [4, 7, 8, 9], "main": [4, 6, 11, 17, 19], "neural": 4, "network": 4, "deep": 4, "learn": 4, "comput": [4, 5, 6, 10, 11, 12, 13, 15, 18, 22], "linear": [4, 15, 22], "weight": [4, 18], "filter": [4, 5, 10, 12], "rang": [4, 5, 7, 8, 9, 11, 12, 21], "pixel": 4, "w": [4, 7, 8, 16, 18], "time": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 21, 22, 23], "h": [4, 11, 22], "f": [4, 5, 11, 12, 20], "f_w": 4, "f_h": 4, "o": [4, 6], "begin": [4, 7, 8, 9, 11], "equat": [4, 7, 8, 9, 11, 18], "nonumb": [4, 11], "x": [4, 5, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "sum": [4, 5, 6, 15], "limits_": 4, "j": [4, 7, 8, 9, 13, 15], "0": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 20, 22], "naiv": [4, 5, 7, 8, 9], "parallel": [4, 7, 8, 9], "thread": [4, 6, 7, 8, 9, 10, 11, 16, 17, 19, 22, 23], "avoid": [4, 6, 15, 23], "confus": 4, "around": [4, 10], "term": 4, "refer": [4, 5, 6, 7, 8, 9, 10, 12, 14, 17, 22], "shown": [4, 6, 17], "press": [4, 7, 8, 9, 11, 15], "shift": [4, 7, 8, 9, 11, 15], "writefil": [4, 15], "convolution_na": [4, 5], "cu": [4, 5, 12, 15, 19, 21], "__global__": [4, 7, 9, 11, 13, 15, 19, 21], "void": [4, 7, 8, 9, 11, 13, 15, 19, 20, 21], "convolution_kernel": [4, 5], "float": [4, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22], "int": [4, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "blockidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "blockdim": [4, 19, 22], "threadidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "image_height": 4, "image_width": 4, "filter_height": 4, "filter_width": 4, "input_width": 4, "run_kernel": [4, 5, 6, 10, 22], "our": [4, 7, 8, 9, 11, 15, 19, 20], "But": [4, 7, 8, 9, 11, 19], "data": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 20, 22], "np": [4, 6, 11, 15, 19, 20], "filter_s": 4, "17": [4, 5, 7, 8, 9, 12], "output_s": 4, "4096": [4, 5, 7, 8, 9, 12, 15], "prod": [4, 5, 12], "border_s": 4, "input_s": [4, 5, 12], "output_imag": 4, "zero": [4, 5, 11, 12, 15], "astyp": [4, 5, 7, 8, 9, 11, 12, 13, 15, 19, 21], "float32": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "input_imag": 4, "random": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21, 22], "randn": [4, 5, 12, 13, 15, 19, 21], "conv_filt": 4, "now": [4, 6, 7, 8, 9, 11, 12, 15, 19], "structur": [4, 6, 7, 8, 15, 19], "kernel_nam": [4, 6, 12, 20, 21, 22], "kernel_sourc": [4, 6, 20, 22], "problem_s": [4, 5, 6, 7, 8, 9, 11, 12, 15, 19, 20, 22, 23], "ellipsi": 4, "indic": [4, 18, 23], "mani": [4, 6, 7, 8, 9, 15, 16, 17, 18, 22], "won": 4, "right": [4, 7, 8, 9, 11, 14], "interest": [4, 10, 20], "five": [4, 6, 19], "string": [4, 6, 7, 8, 9, 10, 15, 16, 17, 19, 20, 22], "domain": [4, 7, 8, 9, 10, 11, 22], "three": [4, 5, 15], "dimens": [4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 19, 22, 23], "dictionari": [4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "simpli": [4, 5, 6, 7, 8, 9, 11, 18, 19, 22], "cell": [4, 7, 8, 9, 11, 15], "wrote": 4, "determin": [4, 7, 8, 9, 11, 17, 18], "grid": [4, 6, 7, 8, 9, 10, 12, 15, 22, 23], "abov": [4, 6, 7, 8, 9, 11, 14, 15, 19, 20], "divid": [4, 7, 8, 9, 11, 12, 15, 22], "divisor": [4, 6, 7, 8, 9, 15, 22], "scalar": [4, 7, 8, 9, 11, 22], "therefor": [4, 5, 7, 8, 9, 11, 12, 15], "exactli": [4, 6, 7, 8, 9, 15, 17], "order": [4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19, 22], "match": [4, 5, 6], "32": [4, 6, 7, 8, 9, 11, 13, 15, 19, 22], "bit": [4, 6, 7, 8, 9, 11, 12, 15], "final": [4, 5, 7, 8, 9, 11], "anyth": 4, "insert": [4, 5, 6, 9, 11, 12, 15, 19, 21, 22, 23], "preprocessor": [4, 6, 22], "statement": [4, 9, 11, 15, 21], "valu": [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 22], "were": [4, 7, 8, 9, 11, 15, 22], "i_like_convolut": 4, "42": 4, "definit": [4, 11, 22], "unless": 4, "cours": [4, 7, 8, 9, 14, 15], "somewher": 4, "token": 4, "freeli": 4, "few": [4, 7, 8, 9, 11, 12, 21], "special": [4, 7, 8, 9, 17, 19, 23], "notic": [4, 7, 8, 9], "haven": [4, 14], "yet": [4, 6, 11, 12, 19], "basic": [4, 6, 7, 8, 9, 19], "block_size_x": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "block_size_i": [4, 5, 7, 8, 9, 11, 12, 15, 22], "block_size_z": [4, 7, 8, 9, 11, 22], "interpret": 4, "z": [4, 6, 11, 22], "block_size_nam": [4, 6, 22], "let": [4, 6, 7, 8, 9, 19, 21], "creation": [4, 13, 18], "trusti": 4, "old": 4, "16": [4, 5, 7, 8, 9, 11, 12, 15], "dict": [4, 5, 6, 9, 12, 13, 17, 18, 19, 21, 22], "undefin": [4, 6, 7, 8, 9, 15], "filter_heigth": 4, "could": [4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 21, 22], "runtim": [4, 6, 7, 8, 9, 13, 14, 17, 21], "setup": [4, 7, 8, 9, 12, 14, 17, 20], "everyth": [4, 6, 7, 8, 9], "answer": [4, 5, 6, 7, 8, 9, 10, 22], "alloc": [4, 6, 7, 8, 9, 10, 12, 22], "move": [4, 6, 7, 12, 15, 18, 22], "content": [4, 6, 22], "deriv": [4, 6, 7, 8, 9, 16], "retriev": [4, 6, 22], "free": [4, 7, 8, 9, 12, 14, 15], "return": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "contrast": 4, "wa": [4, 6, 7, 8, 9, 17, 22], "finish": [4, 6, 8, 11, 12, 17], "than": [4, 7, 8, 9, 11, 16, 17, 18, 22, 23], "highli": [4, 13, 15], "parametr": 4, "long": [4, 7, 8, 9, 11, 12, 15, 20], "instead": [4, 6, 10, 15, 22], "littl": [4, 7, 8, 9, 15], "ve": [4, 7, 8, 9, 14, 15], "familiar": [4, 15], "kernel_str": [4, 5, 6, 7, 8, 9, 12, 13, 18, 22], "tune_param": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21, 22], "similarli": 4, "singl": [4, 5, 6, 7, 8, 9, 12, 15, 17, 21, 22], "wai": [4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 22], "64": [4, 7, 8, 9, 13, 15, 19, 21], "128": [4, 7, 8, 9, 13, 19, 21], "try": [4, 6, 7, 8, 9, 14, 15, 18, 22], "env": [4, 6, 18, 19, 22], "cartesian": [4, 11], "product": [4, 7, 8, 22], "realli": [4, 7, 8, 9, 14], "howev": [4, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21, 22], "lot": [4, 7, 8, 9, 15, 17, 19, 20, 22], "problemat": 4, "explain": [4, 6, 7, 8, 9, 12, 14, 15, 16, 19, 21, 22], "illeg": 4, "2048": 4, "1024": [4, 7, 8, 9, 19], "fail": [4, 6, 14, 22], "too": [4, 7, 8, 9, 11, 12, 15, 22], "regist": [4, 7, 8, 9, 15, 17], "silent": 4, "verbos": [4, 5, 6, 7, 8, 9, 12, 22], "bound": [4, 6, 15, 18], "ignor": [4, 6, 7, 8, 9, 22], "two": [4, 6, 7, 8, 9, 10, 15, 16, 18, 22], "thing": [4, 12, 15], "record": [4, 6, 7, 17, 22], "show": [4, 7, 8, 9, 10, 13, 16, 20], "secondli": [4, 15], "experi": 4, "took": [4, 7, 9, 18, 19, 22], "place": [4, 7, 8, 9, 17, 18, 19, 22], "That": [4, 7, 8, 9, 12, 15, 16, 19], "mean": [4, 12, 15, 16, 18, 20, 21, 23], "softwar": [4, 7, 8, 9, 13, 14, 17, 18, 19], "along": [4, 6, 14, 19, 23], "second": [4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 22], "alwai": [4, 6, 7, 8, 9], "circumst": 4, "obtain": [4, 7, 8, 9, 11, 17], "promis": 4, "tile": [4, 10, 15], "factor": [4, 7, 8, 9, 10, 11, 15, 23], "amount": [4, 7, 8, 9, 15, 16, 22], "particular": [4, 6, 7, 8, 10, 12, 15, 17, 20], "increas": [4, 7, 8, 9, 17], "certain": [4, 6, 7, 8, 9, 10, 17, 23], "tile_size_x": [4, 5, 7, 8, 9, 12, 15], "4": [4, 7, 8, 9, 11, 15, 17], "tile_size_i": [4, 5, 7, 8, 9, 12, 15, 22], "understand": 4, "everi": [4, 5, 7, 8, 9, 10, 17, 19], "fewer": [4, 7, 8, 9], "total": [4, 6, 7, 8, 9, 15, 16, 19], "stai": 4, "tell": [4, 7, 8, 9, 10, 12, 15, 19, 20], "influenc": 4, "did": [4, 7, 8, 9, 15], "mimick": 4, "behavior": [4, 15, 17, 22], "assum": [4, 6, 7, 8, 9, 15, 22], "far": [4, 7, 8, 9, 15, 19], "grid_div_x": [4, 5, 7, 8, 9, 12, 15, 22], "grid_div_i": [4, 5, 7, 8, 9, 12, 15, 22], "decreas": [4, 15], "correspondingli": 4, "displai": 4, "commonli": [4, 7, 8, 9, 14, 15], "gflop": [4, 6, 10, 15, 16], "giga": [4, 15], "compos": [4, 6, 15, 16], "lambda": [4, 6, 7, 8, 15, 16, 22], "collect": [4, 6, 7, 8, 9, 11, 15, 17, 20], "ordereddict": [4, 7, 8, 9, 11, 15, 16], "p": [4, 6, 15, 16, 20, 22], "1e9": [4, 15], "1e3": [4, 7, 8, 9, 15, 16], "expand": [4, 13, 15, 17], "sinc": [4, 9, 11, 13, 15, 21], "And": [4, 7, 8, 9, 18, 21, 22], "know": [4, 7, 8, 9, 15, 16], "enough": [4, 5, 15], "abl": [4, 6, 7, 8, 9], "own": [4, 9, 12, 14, 16, 17], "whenev": 5, "good": [5, 7, 8, 9, 23], "fast": [5, 7, 8, 9], "instanc": [5, 6, 7, 8, 9, 12, 17, 22], "none": [5, 6, 17, 18, 22], "onc": [5, 6, 7, 8, 9, 11, 17, 22], "comparison": [5, 13], "allclos": [5, 22], "maximum": [5, 6, 11, 18, 22], "absolut": [5, 22], "1e": [5, 22], "6": [5, 7, 8, 9, 11, 12, 22], "toler": 5, "atol": [5, 6, 22], "convolution_correct": 5, "py": [5, 12, 14], "demonstr": [5, 9, 10, 15], "r": [5, 12], "cmem_arg": [5, 6, 22], "d_filter": 5, "arg": [5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21], "field": [5, 7, 8, 9], "its": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 22], "almost": [5, 7, 8, 9, 17], "whose": [5, 22], "trust": [5, 18], "construct": [5, 15], "There": [5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 22, 23], "precomput": 5, "flexibl": [5, 7, 8, 15], "callabl": [5, 6, 22], "accept": [5, 6, 18, 22], "cpu_result": 5, "gpu_result": [5, 7, 9], "although": 5, "semant": 5, "posit": [5, 6, 11, 18, 21, 22], "reflect": [5, 17], "reduct": [5, 16, 22], "snippet": 5, "sum_x": 5, "custom": [5, 10, 16, 17, 20], "def": [5, 6, 7, 8, 9, 11, 17, 20], "verify_partial_reduc": 5, "isclos": 5, "first_kernel": 5, "_": [5, 7, 8, 9], "sum_float": 5, "map": [5, 10, 11], "third": [5, 15], "partial": [5, 7, 8, 9, 10], "cpu": [5, 8, 9, 12], "achiev": [5, 9], "element": [5, 7, 8, 9, 15, 16, 19, 20, 22], "necessarili": [5, 12], "section": [6, 7, 8, 9], "intern": [6, 13, 18, 21], "mostli": [6, 13, 22], "relev": [6, 13, 17], "develop": [6, 10, 13, 14], "extens": 6, "architectur": [6, 17], "At": [6, 11, 22], "expos": 6, "respons": 6, "iter": [6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "brute_forc": [6, 22], "valid": [6, 10, 15, 22], "random_sampl": [6, 22], "sampl": [6, 18, 22], "advanc": [6, 21, 22], "being": [6, 7, 8, 9, 15, 17, 18, 22], "strategy_opt": [6, 18, 22], "sai": [6, 7, 8, 9, 19, 21], "foreseen": 6, "futur": [6, 13, 22, 23], "high": [6, 7, 8, 9, 13, 15, 17], "low": [6, 7, 8, 9, 15], "abstract": [6, 17], "ready_argument_list": 6, "build": [6, 7, 8, 9], "bottom": 6, "either": [6, 11, 18, 21, 22], "typic": [6, 14, 15, 22], "gcc": 6, "fortran": [6, 10, 21], "turn": 6, "launch": [6, 7, 8, 9, 12, 17, 22], "rest": [6, 7, 8, 9], "helper": [6, 17], "get_opt": 6, "suppli": [6, 12, 15, 18, 21, 22], "get_strategy_docstr": 6, "method": [6, 7, 8, 9, 12, 15, 17, 18], "make_strategy_options_doc": 6, "scale_from_param": 6, "ep": [6, 18], "func": [6, 17, 22], "invers": 6, "unscal": 6, "setup_method_argu": 6, "setup_method_opt": 6, "tuning_opt": [6, 18], "snap_to_nearest_config": 6, "closest": 6, "unscale_and_snap_to_nearest": 6, "snap": 6, "scale": 6, "nearest": [6, 22], "class": [6, 17, 18], "kernel_opt": 6, "device_opt": 6, "__init__": 6, "instanti": [6, 21], "kernelsourc": 6, "parameter_spac": [6, 18], "iterfac": 6, "platform": [6, 13, 14, 17, 22], "quiet": [6, 22], "compiler_opt": [6, 22], "7": [6, 7, 8, 9, 11, 22], "offer": 6, "bool": [6, 20, 22], "gpu_arg": 6, "skip_nvml_set": 6, "benchmark_continu": 6, "durat": [6, 17], "benchmark_default": 6, "check_kernel_output": 6, "compile_kernel": 6, "copy_constant_memory_arg": 6, "recent": [6, 14, 17], "copy_shared_memory_arg": 6, "smem_arg": [6, 22], "copy_texture_memory_arg": 6, "texmem_arg": [6, 22], "create_kernel_inst": 6, "get_environ": 6, "memcpy_dtoh": [6, 7], "dest": 6, "src": 6, "copi": [6, 7, 8, 9, 12, 19, 22], "static": 6, "preprocess_gpu_argu": 6, "old_argu": 6, "flat": 6, "given": [6, 7, 8, 9, 11, 17, 18, 22], "mem": 6, "set_nvml_paramet": 6, "nvml": [6, 23], "leak": 6, "group": [6, 7, 8, 9, 22], "maintain": 6, "state": [6, 7, 8, 9, 17, 22], "interact": [6, 17], "properti": [6, 15, 22], "context": [6, 7, 9, 11], "kernel_inst": 6, "lookup": 6, "directli": [6, 7, 8, 9, 12, 15, 17, 21, 22], "ndarrai": [6, 11], "format": [6, 7, 8, 20], "kei": [6, 7, 8, 9, 15, 18, 19, 22], "symbol": [6, 22], "similar": [6, 12, 15, 22], "regular": [6, 9, 17], "int32": [6, 13, 19, 21, 22], "kernel_finish": 6, "devicealloc": 6, "memcpy_htod": [6, 7], "memset": 6, "unsign": [6, 8], "byte": [6, 20, 22], "tupl": [6, 9, 11, 18, 22], "start_ev": 6, "event": [6, 7, 12, 17], "mark": 6, "stop_ev": 6, "synchron": [6, 7, 9, 11, 15, 16], "halt": [6, 12], "until": [6, 12], "task": 6, "rawkernel": 6, "cudeviceptr": 6, "cufunct": 6, "must": [6, 16, 22], "buffer": [6, 8, 20], "fill": [6, 15], "item": [6, 7, 8, 9, 11], "ndrang": 6, "cfunction": 6, "cleanup_lib": 6, "previous": [6, 7, 8, 9, 15], "librari": [6, 10, 17, 20], "kernelinst": 6, "repres": [6, 7, 8, 9], "tunabl": [6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 19, 21, 22, 23], "ctype": 6, "_funcptr": 6, "asynchron": 6, "memcpi": [6, 12], "c_arg": 6, "robust": 6, "averag": [6, 7, 8, 9, 12, 17], "ptr": 6, "pionter": 6, "compilationfailedconfig": 6, "errorconfig": 6, "invalidconfig": 6, "npencod": 6, "skipkei": 6, "ensure_ascii": 6, "check_circular": 6, "allow_nan": 6, "sort_kei": 6, "indent": 6, "separ": [6, 10, 12, 21], "dump": [6, 7, 8], "json": [6, 7, 8, 10, 22], "obj": 6, "subclass": 6, "serializ": 6, "rais": 6, "typeerror": 6, "arbitrari": 6, "els": 6, "jsonencod": 6, "runtimefailedconfig": 6, "skippablefailur": 6, "stopcriterionreach": 6, "thrown": 6, "stop": [6, 18], "criterion": [6, 18], "reach": 6, "check_argument_list": 6, "check_argument_typ": 6, "dtype": [6, 20], "kernel_argu": 6, "check_restrict": 6, "whether": [6, 16, 18, 22], "meet": 6, "check_stop_criterion": 6, "max_fev": [6, 18, 22], "exceed": 6, "check_thread_block_dimens": 6, "max_thread": 6, "check_tune_params_list": 6, "simulation_mod": [6, 22], "forbidden": 6, "compile_restrict": 6, "monolith": 6, "try_to_constraint": 6, "union": 6, "str": [6, 7, 8, 9, 11], "constraint": 6, "pars": [6, 7, 8], "config_valid": 6, "max": 6, "convert_constraint_restrict": 6, "convert": [6, 7, 8], "backward": 6, "correct_open_cach": 6, "open_cach": 6, "properli": 6, "close": [6, 7, 8, 9], "pretend": 6, "cuda_error_check": 6, "statu": 6, "delete_temp_fil": 6, "delet": 6, "temporari": 6, "complain": 6, "detect_languag": 6, "attempt": [6, 21], "detect": [6, 18, 21, 22], "dump_cach": 6, "omit": 6, "sever": [6, 7, 8, 9, 10, 11, 14, 15, 21, 22], "store_cach": 6, "speed": 6, "great": [6, 7, 8, 9, 19], "power": [6, 15, 17, 23], "get_best_config": 6, "objective_higher_is_bett": [6, 16, 22], "best": [6, 7, 8, 11, 15, 18, 21, 22, 23], "accord": [6, 22], "get_config_str": 6, "compact": 6, "represent": [6, 20], "get_grid_dimens": 6, "current_problem_s": 6, "grid_div": 6, "dim": 6, "get_instance_str": 6, "debug": 6, "advis": 6, "get_kernel_str": [6, 7, 8, 9], "One": [6, 7, 8, 9, 17, 20], "get_problem_s": 6, "get_smem_arg": 6, "get_temp_filenam": 6, "suffix": [6, 22], "form": [6, 15, 17, 18], "temp_x": 6, "larg": [6, 7, 8, 9, 11, 22], "integ": [6, 17, 20, 22], "get_thread_block_dimens": 6, "convent": [6, 12, 22], "get_total_tim": 6, "overhead_tim": 6, "looks_like_a_filenam": 6, "normalize_verify_funct": 6, "v": [6, 7, 8, 9, 11], "normal": [6, 18, 22], "result_host": 6, "keyword": 6, "behaviour": 6, "parse_restrict": 6, "prepare_kernel_str": 6, "prepend": [6, 9], "seri": [6, 11], "By": [6, 12, 15, 18, 22], "macro": 6, "made": 6, "print_config": 6, "print_config_output": 6, "process_cach": 6, "device_nam": [6, 22], "tune_params_kei": 6, "x1": 6, "x2": 6, "xn": 6, "234342": 6, "y1": 6, "y2": 6, "yn": 6, "134233": 6, "bracket": 6, "earlier": [6, 7, 8, 9, 11], "abruptli": 6, "process_metr": 6, "calcul": [6, 11], "express": [6, 7, 8, 9, 10, 12, 15, 22], "10000": 6, "read_cach": 6, "cachefil": [6, 22], "read_fil": 6, "replace_param_occurr": 6, "occurr": 6, "setup_block_and_grid": 6, "to_valid_nvrtc_gpu_arch_cc": 6, "compute_cap": 6, "index": [6, 18], "group__opt": 6, "write_fil": 6, "whole": [7, 8, 9, 15, 18], "model": [7, 8, 9, 13], "physic": 7, "numer": [7, 8, 9], "introduc": [7, 8, 9, 15, 17], "redistribut": [7, 8, 9], "region": [7, 8, 9], "concentr": [7, 8, 9], "bulk": [7, 8, 9], "motion": [7, 8, 9], "concept": [7, 8, 9], "wide": [7, 8, 9, 14, 15], "chemistri": [7, 8, 9], "biologi": [7, 8, 9], "suppos": [7, 8, 9], "metal": [7, 8, 9], "sheet": [7, 8, 9], "temperatur": [7, 8, 9, 17, 18, 23], "equal": [7, 8, 9, 15, 22], "degre": [7, 8, 9], "everywher": [7, 8, 9], "heat": [7, 8, 9], "thousand": [7, 8, 9], "instant": [7, 8, 9, 11], "hotspot": [7, 8, 9], "cooler": [7, 8, 9], "area": [7, 8, 9, 15], "melt": [7, 8, 9], "loss": [7, 8, 9], "radiat": [7, 8, 9], "frac": [7, 8, 9], "d": [7, 8, 9, 11, 18, 19], "spatial": [7, 8, 9], "descret": [7, 8, 9], "2d": [7, 8, 9, 10], "quantiti": [7, 8, 9, 16, 17, 22], "nx": [7, 8, 9, 11], "equi": [7, 8, 9], "distant": [7, 8, 9], "direct": [7, 8, 9, 12, 15, 16, 22], "ny": [7, 8, 9, 11], "distanc": [7, 8, 9, 18], "delta": [7, 8, 9], "central": [7, 8, 9], "approxim": [7, 8, 9], "x_i": [7, 8, 9, 11], "x_": [7, 8, 9], "approx": [7, 8, 9], "u_": [7, 8, 9], "2u_": [7, 8, 9], "y_": [7, 8, 9], "estim": [7, 8, 9], "next": [7, 8, 9, 15, 20], "simplifi": [7, 8, 9], "formula": [7, 8, 9], "4u_": [7, 8, 9], "simplic": [7, 8, 9, 11], "assumpt": [7, 8, 9], "boundari": [7, 8, 9], "condit": [7, 8, 9, 15], "dt": [7, 8, 9], "225": [7, 8, 9], "test": [7, 8, 9, 10, 14, 15, 17, 22], "initi": [7, 8, 9, 20], "hot": [7, 8, 9], "plot": [7, 8, 9], "color": [7, 8, 9], "matplotlib": [7, 8, 9, 14], "pyplot": [7, 8, 9], "inlin": [7, 8, 9], "get_initial_condit": [7, 8, 9], "ones": [7, 8, 9, 23], "randint": [7, 8, 9], "1000": [7, 8, 9, 11], "2000": [7, 8, 9], "fig": [7, 8, 9], "ax1": [7, 8, 9], "ax2": [7, 8, 9], "subplot": [7, 8, 9], "imshow": [7, 8, 9], "lt": [7, 8, 9], "axesimag": [7, 8, 9], "0x2aaab952f240": 7, "gt": [7, 8, 9], "quick": [7, 8, 9], "later": [7, 8, 9, 11, 22], "field_copi": [7, 8], "4164": 7, "018869400024": 7, "0x2aab1c98b3c8": 7, "worri": [7, 9], "terminologi": [7, 9], "text": [7, 9, 15], "5": [7, 8, 9, 11, 18], "225f": [7, 8, 9], "diffuse_kernel": [7, 8, 9], "u_new": [7, 8, 9], "0f": [7, 8, 9], "togeth": [7, 8, 9, 14, 22], "impact": [7, 8, 9, 12], "fix": [7, 8, 9, 18, 22], "unrol": [7, 8, 9, 10, 15, 23], "loop": [7, 8, 9, 10, 15, 23], "drv": 7, "sourcemodul": [7, 9, 11], "init": 7, "make_context": 7, "devprop": 7, "k": [7, 8, 9, 11, 13, 15, 19], "get_devic": 7, "get_attribut": 7, "cc": 7, "compute_capability_major": 7, "compute_capability_minor": 7, "u_old": [7, 9], "mem_alloc": 7, "nbyte": 7, "block_size_str": [7, 9], "arch": 7, "sm_": 7, "get_funct": [7, 9, 11], "boilerpl": [7, 8, 9], "moment": [7, 8, 9, 22], "serv": [7, 8, 9, 16, 18], "guess": [7, 8, 9], "pair": [7, 8, 9], "500": [7, 8, 9], "time_sinc": 7, "zeros_lik": [7, 11, 13, 15, 19, 21], "set_titl": [7, 8, 9], "53": [7, 8, 9], "423038482666016": 7, "0x2aaabbdcb2e8": 7, "faster": [7, 8, 9, 15], "cleanup": 7, "pop": 7, "think": [7, 8, 9], "messi": [7, 8, 9], "got": [7, 8, 9], "cleaner": [7, 8, 9], "plai": [7, 8, 9], "difficult": [7, 8, 9, 20, 21], "rather": [7, 8, 9, 22], "underutil": [7, 8, 9], "purpos": [7, 8, 9, 12, 15, 22, 23], "feel": [7, 8, 9], "48": [7, 8, 9], "care": [7, 8, 9], "appropi": [7, 8, 9], "fly": [7, 8, 9], "12": [7, 8, 9], "13": [7, 8, 9], "geforc": [7, 8, 9, 11], "gtx": [7, 8, 9, 11], "titan": [7, 8, 9], "22305920124": 7, "779033613205": 7, "824838399887": 7, "900499212742": 7, "999763202667": 7, "727967989445": 7, "752479994297": 7, "797900807858": 7, "876627194881": 7, "93347837925": 7, "766662418842": 7, "803033602238": 7, "853574407101": 7, "971545600891": 7, "763775992393": 7, "791257584095": 7, "848044800758": 7, "922745585442": 7, "792595207691": 7, "822137594223": 7, "893279993534": 7, "millisecond": [7, 8, 9], "matter": [7, 8, 9, 12], "analyz": [7, 8, 9], "seem": [7, 8, 9], "vari": [7, 8, 9, 11, 15, 16], "addtion": [7, 8, 9], "among": [7, 8, 9, 13, 18], "128x32": [7, 8, 9], "likewis": [7, 8, 9], "becom": [7, 8, 9, 17, 18], "affect": [7, 8, 9, 15], "within": [7, 8, 9, 11, 15, 18, 22], "exchang": [7, 8, 9], "fact": [7, 8, 9, 12], "commun": [7, 8, 9], "idea": [7, 8, 9, 12, 15, 23], "l2": [7, 8, 9], "closer": [7, 8, 9], "multiprocessor": [7, 8, 9], "l1": [7, 8, 9], "fine": [7, 8, 9], "grain": [7, 8, 9], "manag": [7, 8, 9, 15, 17], "cost": [7, 8, 9, 18], "overhead": [7, 8, 9, 15], "degrad": [7, 8, 9], "intermedi": [7, 8, 9], "mind": [7, 8, 9], "14": [7, 8, 9], "tx": [7, 8, 9, 15], "ty": [7, 8, 9, 15], "bx": [7, 8, 9, 11], "__shared__": [7, 9, 15], "sh_u": [7, 8, 9], "pragma": [7, 8, 9, 15], "__syncthread": [7, 8, 9, 15], "75041918755": 7, "18713598251": 7, "09015038013": 7, "06844799519": 7, "09730558395": 7, "14420480728": 7, "05957758427": 7, "07508480549": 7, "0731967926": 7, "14729599953": 7, "08389122486": 7, "10700161457": 7, "10125439167": 7, "31661438942": 7, "0629119873": 7, "04807043076": 7, "054880023": 7, "12033278942": 7, "06672639847": 7, "05816960335": 7, "12000002861": 7, "merg": [7, 8, 9, 15], "half": [7, 8, 9], "doubl": [7, 8, 9, 20, 21], "cover": [7, 8, 9, 18], "beyond": [7, 8, 9, 22], "reduc": [7, 8, 9, 15], "condens": [7, 8, 9], "keep": [7, 8, 9, 15, 20], "importantli": [7, 8, 9], "worst": [7, 8, 9], "15": [7, 8, 9, 21], "tj": [7, 8, 9], "ti": [7, 8, 9, 11], "somehow": [7, 8, 9], "larger": [7, 8, 9, 12, 18, 21], "insid": [7, 8, 9, 12, 15, 21, 22], "round": [7, 8, 9, 22], "arithmet": [7, 8, 9, 22], "evalu": [7, 8, 9, 15, 18, 22], "759308815": 7, "29789438248": 7, "06983039379": 7, "2634239912": 7, "997139203548": 7, "843692803383": 7, "05549435616": 7, "862348806858": 7, "750636804104": 7, "19084160328": 7, "876377594471": 7, "714169609547": 7, "875001597404": 7, "691116797924": 7, "575859189034": 7, "759679996967": 7, "622867202759": 7, "650336003304": 7, "09794559479": 7, "826515209675": 7, "692665600777": 7, "78363519907": 7, "646092808247": 7, "554745602608": 7, "716115188599": 7, "581280004978": 7, "662566399574": 7, "07386879921": 7, "833420813084": 7, "705055999756": 7, "840755212307": 7, "652575993538": 7, "569388794899": 7, "689356791973": 7, "597267186642": 7, "675232005119": 7, "10033922195": 7, "860332798958": 7, "731891202927": 7, "867276787758": 7, "68781440258": 7, "595276796818": 7, "735436797142": 7, "60216319561": 7, "852166390419": 7, "15089921951": 7, "852575981617": 7, "705932807922": 7, "888671982288": 7, "673248004913": 7, "563417613506": 7, "761139214039": 7, "621254396439": 7, "676595199108": 7, "06709122658": 7, "804953610897": 7, "685670387745": 7, "801798415184": 7, "632006394863": 7, "542387211323": 7, "722668802738": 7, "578745603561": 7, "618598401546": 7, "08220798969": 7, "821881604195": 7, "687955200672": 7, "77759360075": 7, "618003201485": 7, "539891195297": 7, "705900788307": 7, "568556785583": 7, "624492788315": 7, "0799423933": 7, "832300806046": 7, "70140799284": 7, "835481595993": 7, "638348805904": 7, "550105595589": 7, "667251205444": 7, "576044797897": 7, "732409596443": 7, "15916161537": 7, "869497597218": 7, "733248019218": 7, "890803205967": 7, "677363204956": 7, "577215993404": 7, "730982398987": 7, "58035838604": 7, "10066559315": 7, "837804794312": 7, "691385602951": 7, "851040017605": 7, "666656005383": 7, "560505592823": 7, "771103990078": 7, "626163220406": 7, "694451200962": 7, "11514236927": 7, "837299215794": 7, "703302407265": 7, "806828796864": 7, "648620784283": 7, "562521612644": 7, "760915207863": 7, "605760002136": 7, "690009605885": 7, "10740480423": 7, "841631996632": 7, "700883197784": 7, "838195204735": 7, "649779188633": 7, "56585599184": 7, "7168192029": 7, "59088640213": 7, "69627519846": 7, "3269824028": 7, "02665598392": 7, "840908801556": 7, "03752319813": 7, "788345599174": 7, "662041604519": 7, "85437438488": 7, "680422389507": 7, "0759360075": 7, "801996803284": 7, "666003203392": 7, "808000004292": 7, "643359994888": 7, "544691193104": 7, "741964805126": 7, "60942081213": 7, "681350398064": 7, "05262081623": 7, "792108798027": 7, "66344319582": 7, "768064010143": 7, "625260794163": 7, "540352010727": 7, "721862399578": 7, "579411196709": 7, "626976013184": 7, "06332798004": 7, "808211183548": 7, "679372787476": 7, "803718411922": 7, "627136015892": 7, "538227200508": 7, "682188808918": 7, "573836791515": 7, "725548803806": 7, "13023357391": 7, "843411195278": 7, "713843202591": 7, "85886080265": 7, "657920002937": 7, "565254402161": 7, "697094392776": 7, "579904007912": 7, "07484800816": 7, "801119995117": 7, "667347204685": 7, "799059200287": 7, "643820810318": 7, "542937588692": 7, "740518403053": 7, "615148806572": 7, "731334400177": 7, "07002239227": 7, "805299210548": 7, "675923216343": 7, "782060790062": 7, "631142401695": 7, "540383994579": 7, "723999989033": 7, "578681600094": 7, "726335990429": 7, "13297917843": 7, "844428789616": 7, "710278391838": 7, "835494399071": 7, "637958395481": 7, "567417597771": 7, "699366402626": 7, "588492810726": 7, "tri": [7, 8, 9, 18], "grow": [7, 8, 9], "quickli": [7, 8, 9], "went": [7, 8, 9, 11], "72": [7, 8, 9], "26": [7, 8, 9], "32x2": [7, 8, 9], "64x4": [7, 8, 9], "four": [7, 8, 9], "best_tim": [7, 8], "min": [7, 8], "05": [7, 8], "join": [7, 8], "nice": [7, 8], "stdout": [7, 8], "why": [7, 8, 12, 16], "easili": [7, 8, 17], "easi": [7, 8, 16, 17, 22], "csv": [7, 8, 10], "analysi": [7, 8, 13], "panda": [7, 8, 10, 14], "18": [7, 8, 9], "fp": [7, 8], "datafram": [7, 8], "df": [7, 8], "to_csv": [7, 8], "0x2aab1de088d0": 8, "01": 8, "sy": 8, "140": 8, "wall": 8, "98": 8, "__kernel": 8, "get_group_id": 8, "get_local_id": 8, "cl": 8, "ctx": 8, "create_some_context": 8, "mf": 8, "mem_flag": 8, "a_h": 8, "a_d": 8, "read_writ": 8, "copy_host_ptr": 8, "hostbuf": 8, "b_d": 8, "kernel_src": 8, "prg": 8, "queue": 8, "commandqueu": 8, "run_gpu": 8, "444": 8, "154": 8, "598": 8, "985": 8, "enqueue_copi": 8, "1748096": 8, "7284544": 8, "7707904": 8, "8573184": 8, "8380288": 8, "686528": 8, "69648": 8, "7461632": 8, "818304": 8, "771072": 8, "7190464": 8, "7522432": 8, "7982208": 8, "9624512": 8, "7214464": 8, "7453312": 8, "8028416": 8, "8922624": 8, "747328": 8, "7860736": 8, "8637184": 8, "__local": 8, "barrier": 8, "clk_local_mem_f": 8, "8449472": 8, "1912576": 8, "1035136": 8, "0927808": 8, "1140736": 8, "1790336": 8, "0808192": 8, "0809792": 8, "0836928": 8, "1545856": 8, "1249984": 8, "1264": 8, "1230336": 8, "4015104": 8, "0873216": 8, "0626496": 8, "0692224": 8, "140192": 8, "0801344": 8, "0688128": 8, "1428928": 8, "8844544": 8, "3245952": 8, "0911808": 8, "3039616": 8, "0079296": 8, "84848": 8, "0708288": 8, "857728": 8, "7561792": 8, "231072": 8, "8774336": 8, "7087296": 8, "8772672": 8, "6911872": 8, "5715968": 8, "7584896": 8, "6292032": 8, "6498688": 8, "1145664": 8, "8252928": 8, "6757568": 8, "7881152": 8, "6237696": 8, "544224": 8, "6951168": 8, "5648128": 8, "6452736": 8, "1065792": 8, "8313792": 8, "6905984": 8, "8302656": 8, "6367488": 8, "5478592": 8, "6660672": 8, "5719744": 8, "6551744": 8, "1384064": 8, "8531072": 8, "7078976": 8, "8516672": 8, "6677696": 8, "5685632": 8, "7074048": 8, "5753152": 8, "8228864": 8, "2124736": 8, "8633344": 8, "6921216": 8, "8896384": 8, "6659904": 8, "5582144": 8, "7522624": 8, "6081536": 8, "6664448": 8, "1095936": 8, "8063424": 8, "6717888": 8, "7982848": 8, "6263552": 8, "5289728": 8, "7008832": 8, "567456": 8, "5968704": 8, "1018432": 8, "8117248": 8, "6724736": 8, "7728576": 8, "6038336": 8, "5172352": 8, "6796352": 8, "5470016": 8, "5968448": 8, "1107712": 8, "8237248": 8, "6810944": 8, "821952": 8, "620352": 8, "5230208": 8, "6415552": 8, "5476864": 8, "7168192": 8, "1942016": 8, "8626304": 8, "7099712": 8, "9123328": 8, "6608448": 8, "5631168": 8, "7113024": 8, "556576": 8, "1583104": 8, "8384832": 8, "67856": 8, "845856": 8, "6581248": 8, "54944": 8, "7520064": 8, "6076224": 8, "6842112": 8, "1547072": 8, "8422016": 8, "6895552": 8, "8037312": 8, "6387072": 8, "5383296": 8, "7326656": 8, "5863488": 8, "6813376": 8, "1493952": 8, "8444928": 8, "6929216": 8, "832768": 8, "6389312": 8, "5412672": 8, "698336": 8, "5717568": 8, "676096": 8, "4303104": 8, "0341696": 8, "8365184": 8, "0398656": 8, "7786496": 8, "648928": 8, "8479232": 8, "6508544": 8, "1219392": 8, "7994048": 8, "6492288": 8, "8068416": 8, "6343168": 8, "5235328": 8, "7268928": 8, "5898432": 8, "6633536": 8, "0849664": 8, "7869632": 8, "6458624": 8, "7611968": 8, "613088": 8, "50912": 8, "6972928": 8, "5620608": 8, "601856": 8, "095232": 8, "7967488": 8, "6601472": 8, "7952896": 8, "6047296": 8, "5108224": 8, "6607744": 8, "5492416": 8, "7091136": 8, "171552": 8, "8473408": 8, "6962112": 8, "8663936": 8, "6466816": 8, "5475584": 8, "6754048": 8, "5591744": 8, "108896": 8, "7907264": 8, "6459328": 8, "7965888": 8, "6250816": 8, "5188416": 8, "721408": 8, "5920832": 8, "7068608": 8, "0909248": 8, "7930752": 8, "6524544": 8, "7745216": 8, "6146176": 8, "5116928": 8, "6975872": 8, "5548416": 8, "7075136": 8, "174624": 8, "8384512": 8, "69104": 8, "8335488": 8, "6264192": 8, "5445248": 8, "6719104": 8, "5592064": 8, "19": [8, 9], "solv": 9, "0x7f888f8cd7b8": 9, "4152": 9, "086019515991": 9, "0x7f8865b51f28": 9, "gpuarrai": [9, 11], "tool": [9, 11, 13], "autoinit": [9, 11], "to_gpu": [9, 11], "mod": [9, 11], "t0": [9, 11], "ona": 9, "33": 9, "46109390258789": 9, "0x7f8858b873c8": 9, "1080": [9, 11], "916985595226": 9, "489004802704": 9, "500524806976": 9, "513356792927": 9, "545715200901": 9, "486515200138": 9, "449055999517": 9, "44974719882": 9, "457427197695": 9, "492915201187": 9, "464863997698": 9, "466118401289": 9, "475264000893": 9, "513632011414": 9, "458412796259": 9, "457715201378": 9, "461017608643": 9, "475987195969": 9, "460032004118": 9, "457779198885": 9, "462649595737": 9, "kernel_string_shar": 9, "22673916817": 9, "826361596584": 9, "793516802788": 9, "782112002373": 9, "776639997959": 9, "795135998726": 9, "722777605057": 9, "762777590752": 9, "75422719717": 9, "804876792431": 9, "778656005859": 9, "769734406471": 9, "782495999336": 9, "932281601429": 9, "734028804302": 9, "721625590324": 9, "736511993408": 9, "800019192696": 9, "724966406822": 9, "722969603539": 9, "759430396557": 9, "kernel_string_til": 9, "22200961113": 9, "91601279974": 9, "752838408947": 9, "873651194572": 9, "69833599329": 9, "586931192875": 9, "516473591328": 9, "411392003298": 9, "384262400866": 9, "82159358263": 9, "632607996464": 9, "506457602978": 9, "618758392334": 9, "500288009644": 9, "429862397909": 9, "44995200038": 9, "366150397062": 9, "342201602459": 9, "793542397022": 9, "58026239872": 9, "494163197279": 9, "546316814423": 9, "467059195042": 9, "404249596596": 9, "440895992517": 9, "341376006603": 9, "339692795277": 9, "783923208714": 9, "597920000553": 9, "50277120471": 9, "615475213528": 9, "470937597752": 9, "418393599987": 9, "443519997597": 9, "343961596489": 9, "342540800571": 9, "780352008343": 9, "611705589294": 9, "515667212009": 9, "622534394264": 9, "502195191383": 9, "437388807535": 9, "45568639636": 9, "359289598465": 9, "426995199919": 9, "788947200775": 9, "616556799412": 9, "496121603251": 9, "629164803028": 9, "474841600657": 9, "407667201757": 9, "47406719923": 9, "371507203579": 9, "352531200647": 9, "72023679018": 9, "574816000462": 9, "481817597151": 9, "580928003788": 9, "455724793673": 9, "394975996017": 9, "464659202099": 9, "357107198238": 9, "324083191156": 9, "759910392761": 9, "569177603722": 9, "481279999018": 9, "528115200996": 9, "441734397411": 9, "393126398325": 9, "455404800177": 9, "350457596779": 9, "322547197342": 9, "754201591015": 9, "579827189445": 9, "491852802038": 9, "582751989365": 9, "451283198595": 9, "391807991266": 9, "456275194883": 9, "356716805696": 9, "362937599421": 9, "809894394875": 9, "60433280468": 9, "507142400742": 9, "655827200413": 9, "474092799425": 9, "408166396618": 9, "480531209707": 9, "346707201004": 9, "780134403706": 9, "601049602032": 9, "493900799751": 9, "620384001732": 9, "494553589821": 9, "425414395332": 9, "467033600807": 9, "375468802452": 9, "346079999208": 9, "771052801609": 9, "593977594376": 9, "49723520875": 9, "583270406723": 9, "478079998493": 9, "416320002079": 9, "443942397833": 9, "359744000435": 9, "343545603752": 9, "780960011482": 9, "598758399487": 9, "498617601395": 9, "57678719759": 9, "46561280489": 9, "41324160099": 9, "431225597858": 9, "351263999939": 9, "34440960288": 9, "933260798454": 9, "715257608891": 9, "586604809761": 9, "711615991592": 9, "558771193027": 9, "466284793615": 9, "44043520093": 9, "361823999882": 9, "731839990616": 9, "57044479847": 9, "470220798254": 9, "608800005913": 9, "472665601969": 9, "416352003813": 9, "481376004219": 9, "380812799931": 9, "351923197508": 9, "719257593155": 9, "55171200037": 9, "466758400202": 9, "568435204029": 9, "459654402733": 9, "394380801916": 9, "463052803278": 9, "36409599781": 9, "328998398781": 9, "73579518795": 9, "564575994015": 9, "472236800194": 9, "549024009705": 9, "438406395912": 9, "389945602417": 9, "455193603039": 9, "364051198959": 9, "375519996881": 9, "798195195198": 9, "588998401165": 9, "49552000761": 9, "595462405682": 9, "460972803831": 9, "400672000647": 9, "465132802725": 9, "364627194405": 9, "729363203049": 9, "558815991879": 9, "466655993462": 9, "600819194317": 9, "460281592607": 9, "404908800125": 9, "478739196062": 9, "386668801308": 9, "385510402918": 9, "720915210247": 9, "550668799877": 9, "466937589645": 9, "564921605587": 9, "447974395752": 9, "394271999598": 9, "46233600378": 9, "365190398693": 9, "387827193737": 9, "762003195286": 9, "579007995129": 9, "486649608612": 9, "557331204414": 9, "443033593893": 9, "396070402861": 9, "457075202465": 9, "369555193186": 9, "wish": 9, "modifi": [9, 17], "tile_size_j": 9, "fixed_param": [9, 11], "ceil": [9, 11], "zip": [9, 11], "transfer": [9, 10, 12], "20": [9, 18], "21": 9, "618": 9, "2231903076172": 9, "0x7f887c3d2358": 9, "incorpor": 9, "ifndef": 9, "kerenel": 9, "psedo": 9, "endif": 9, "bypass": 9, "usecas": 10, "test_vector_add": 10, "test_vector_add_parameter": 10, "highlight": 10, "contact": 10, "illustr": 10, "openacc": 10, "dimension": [10, 11, 22], "clean": [10, 15], "center": [10, 11], "lock": [10, 17], "overlap": [10, 12], "shuffl": 10, "pipelin": 10, "consist": [10, 15, 22], "scipi": 10, "algorithm": [10, 13, 18, 22], "cub": 10, "gaussian": 11, "delv": 11, "hand": [11, 15], "sum_": 11, "exp": 11, "beta": [11, 18], "sqrt": 11, "y_i": 11, "z_i": 11, "vector": [11, 12, 19], "coordin": 11, "linalg": 11, "la": 11, "compute_grid": 11, "xgrid": 11, "ygrid": 11, "zgrid": 11, "x0": 11, "y0": 11, "z0": 11, "themselv": 11, "meshgrid": 11, "send": 11, "interv": 11, "256": [11, 13, 19], "suffici": [11, 16], "100": [11, 18, 22], "randomli": [11, 18], "distribut": [11, 15], "linspac": 11, "cpu_grid": 11, "npt": 11, "rand": 11, "xyz": [11, 22], "52320": 11, "160627": 11, "might": [11, 16], "nz": 11, "bz": 11, "kernel_cod": 11, "math": 11, "__host__": 11, "__device__": [11, 21], "b": [11, 13, 15, 18, 19, 21], "addgrid": 11, "xvect": 11, "yvect": 11, "zvect": 11, "dx": 11, "dy": 11, "dz": 11, "assign": 11, "explor": 11, "middl": 11, "henc": [11, 20], "mention": 11, "56833920479": 11, "80796158314": 11, "940044796467": 11, "855628800392": 11, "855359995365": 11, "16174077988": 11, "11877760887": 11, "01592960358": 11, "849273598194": 11, "849235200882": 11, "19029750824": 11, "16199679375": 11, "40401918888": 11, "39618558884": 11, "39508478642": 11, "31647996902": 11, "31470079422": 11, "50787198544": 11, "53760001659": 11, "56709756851": 11, "34500494003": 11, "25130877495": 11, "50662400723": 11, "55267841816": 11, "17987194061": 11, "12309756279": 11, "01125121117": 11, "849631989002": 11, "853708791733": 11, "17051515579": 11, "15584001541": 11, "40074241161": 11, "39547519684": 11, "39331197739": 11, "30295038223": 11, "28725762367": 11, "39589118958": 11, "38867840767": 11, "37724158764": 11, "34344320297": 11, "26213116646": 11, "38793599606": 11, "3775359869": 11, "74003200531": 11, "13276162148": 11, "37233917713": 11, "18835201263": 11, "15777277946": 11, "40247042179": 11, "39366400242": 11, "39439997673": 11, "23719043732": 11, "28542718887": 11, "39207677841": 11, "38956804276": 11, "3778496027": 11, "29814395905": 11, "26398081779": 11, "38625922203": 11, "3754431963": 11, "72981758118": 11, "12483196259": 11, "37322881222": 11, "61618566513": 11, "2194111824": 11, "17600002289": 11, "27082881927": 11, "38787200451": 11, "3835711956": 11, "37543039322": 11, "30227203369": 11, "23127679825": 11, "38627202511": 11, "37677440643": 11, "64358406067": 11, "12255358696": 11, "37474560738": 11, "61655673981": 11, "19179515839": 11, "99912958145": 11, "213971138": 11, "16430072784": 11, "38772480488": 11, "3735104084": 11, "54432649612": 11, "05524477959": 11, "36935677528": 11, "42449922562": 11, "10455036163": 11, "67516155243": 11, "programmat": 11, "30": 11, "minimum": 11, "84": 11, "suit": [11, 22], "grid_dim": 11, "associ": 11, "substitut": 11, "ourselv": 11, "extract": 11, "manual": [11, 14], "exlicitli": 11, "accur": [11, 17], "xgpu": 11, "ygpu": 11, "zgpu": 11, "grid_gpu": 11, "80": 11, "133200": 11, "lower": [11, 17, 18], "roughli": [11, 15], "40000": 11, "across": [12, 15], "qualiti": 12, "itself": [12, 13, 22], "precis": 12, "plain": 12, "omp_get_wtim": 12, "openmp": 12, "convolution_stream": 12, "complex": [12, 15], "behind": 12, "spread": 12, "back": [12, 22], "split": 12, "chunk": 12, "slightli": [12, 15, 21], "account": [12, 15], "border": [12, 22], "latter": 12, "cudastreamwaitev": 12, "num_stream": 12, "clarifi": 12, "fit": [12, 18], "choic": [12, 14], "grid_size_x": 12, "grid_size_i": 12, "cudamemcpytosymbol": 12, "upload": 12, "yourself": [12, 22], "spent": [12, 22], "relat": [13, 16, 23], "famili": 13, "launcher": 13, "kt": [13, 20], "easiest": 13, "toolkit": [13, 14], "intend": 13, "Or": [13, 14], "vector_add": [13, 18, 19, 21], "10000000": 13, "512": [13, 19], "research": 13, "cite": 13, "paper": 13, "significantli": [13, 15, 17], "articl": [13, 19], "author": 13, "ben": 13, "van": 13, "werkhoven": 13, "titl": 13, "auto": [13, 15, 17, 18, 21, 22, 23], "journal": 13, "year": 13, "2019": 13, "volum": 13, "90": 13, "347": 13, "358": 13, "url": 13, "www": 13, "sciencedirect": 13, "scienc": 13, "pii": 13, "s0167739x18313359": 13, "doi": 13, "1016": 13, "2018": 13, "08": 13, "004": 13, "referenc": 13, "bayesian": [13, 18, 22], "willemsen2021bayesian": 13, "willemsen": [13, 18], "flori": 13, "jan": 13, "nieuwpoort": 13, "rob": 13, "workshop": 13, "pmb": 13, "supercomput": 13, "sc21": 13, "2021": 13, "arxiv": 13, "ab": 13, "2111": 13, "14991": 13, "difficulti": 13, "schoonhoven2022benchmark": 13, "schoonhoven": 13, "richard": 13, "batenburg": 13, "joost": 13, "ieee": 13, "transact": 13, "evolutionari": 13, "2022": 13, "consumpt": [13, 15, 17], "schoonhoven2022go": 13, "veenboer": 13, "bram": 13, "green": 13, "effici": [13, 15, 17], "steer": 13, "sc22": 13, "2211": 13, "07260": 13, "comprehens": 14, "recommend": [14, 20], "download": 14, "repo": 14, "continuum": 14, "io": 14, "miniconda3": 14, "x86_64": 14, "sh": 14, "newer": [14, 17], "nativ": 14, "prefix": 14, "home": 14, "pythonpath": 14, "bind": [14, 17], "older": 14, "troubl": 14, "retri": 14, "wiki": 14, "tiker": 14, "net": 14, "amd": [14, 17], "app": 14, "sdk": 14, "intel": 14, "appl": 14, "beignet": 14, "stack": 14, "altern": [14, 22], "navig": 14, "benvanwerkhoven": 14, "differenti": [14, 18, 22], "chanc": [14, 18, 21], "algebra": 15, "frequent": 15, "programm": [15, 17], "row": 15, "column": 15, "squar": 15, "matric": 15, "matmul_na": 15, "width": 15, "matmul_kernel": 15, "height": 15, "Of": 15, "solut": [15, 17], "realiti": 15, "contant": 15, "denot": [15, 19, 22], "sensibl": 15, "pick": 15, "word": 15, "warpsiz": 15, "namelijk": 15, "stand": 15, "briefli": 15, "figur": 15, "fifth": 15, "fourth": 15, "dramat": 15, "profil": 15, "pretti": 15, "opportun": 15, "realiz": 15, "collabor": 15, "bandwidth": 15, "techniqu": 15, "submatric": 15, "proce": 15, "matmul_shar": 15, "sa": 15, "sb": 15, "kb": 15, "outer": 15, "inner": 15, "race": 15, "drastic": 15, "due": [15, 21, 22], "fortun": 15, "benefit": 15, "redund": 15, "distinct": 15, "1xn": 15, "usag": [15, 17], "occup": 15, "goe": 15, "down": 15, "matmul": 15, "newli": 15, "coupl": 15, "respect": [15, 17], "independ": 15, "yield": 15, "discontinu": 15, "room": 15, "impos": 15, "report": [16, 17, 22, 23], "possibli": [16, 22], "_flop": 16, "total_flop": 16, "ps_energi": [16, 17, 23], "occur": [16, 22], "exhaust": 16, "brute": [16, 18, 19], "forc": [16, 18, 19, 21], "maxim": [16, 22], "boolean": [16, 17, 22], "facilit": 17, "layer": 17, "act": 17, "hook": 17, "pattern": 17, "subscrib": 17, "benchmarkobserv": 17, "overwritten": [17, 22], "extend": 17, "mandatori": 17, "get_result": 17, "aggreg": 17, "after_finish": 17, "after_start": 17, "before_start": 17, "register_configur": 17, "register_devic": 17, "variou": [17, 19], "registerobserv": 17, "track": 17, "num_reg": 17, "current_modul": 17, "powersensor2": 17, "pcie": 17, "intercept": 17, "sensor": 17, "transmit": 17, "usb": 17, "connect": 17, "advantag": 17, "instantan": 17, "frequenc": 17, "khz": 17, "pybind11": 17, "powersensor": [17, 23], "ps_power": [17, 23], "joul": [17, 23], "watt": [17, 23], "ttyacm0": 17, "core": 17, "voltag": 17, "thin": 17, "wrapper": [17, 21], "intricaci": 17, "friendli": 17, "mode": 17, "repeatedli": 17, "downsid": 17, "approach": 17, "save_al": 17, "nvidia_smi_fallback": 17, "use_locked_clock": 17, "continous_dur": 17, "monitor": 17, "clock": [17, 23], "power_read": [17, 23], "nvml_power": [17, 23], "nvml_energi": [17, 23], "core_freq": [17, 23], "mem_freq": [17, 23], "gr_voltag": 17, "ordin": 17, "identifi": 17, "smi": 17, "root": 17, "opt": 17, "amper": 17, "continuous_dur": 17, "common": [17, 21], "cap": 17, "popular": 17, "nvml_gr_clock": [17, 23], "nvml_mem_clock": [17, 23], "nvml_pwr_limit": [17, 23], "graphic": [17, 23], "jetson": 17, "rapl": 17, "xilinx": 17, "pmt": 17, "astron": 17, "nl": 17, "rd": 17, "meter": 17, "arduino": 17, "_energi": 17, "_power": 17, "acceler": 18, "prohibit": 18, "slow": 18, "wast": 18, "basin": [18, 22], "hop": [18, 22], "dual": [18, 22], "anneal": [18, 22], "evolut": [18, 22], "firefli": [18, 22], "genet": [18, 22], "greedi": [18, 22], "local": [18, 22], "multi": [18, 22], "particl": [18, 22], "swarm": [18, 22], "mechan": 18, "overrid": 18, "time_limit": [18, 22], "uniqu": [18, 22], "count": 18, "searchspac": 18, "runner": 18, "nelder": 18, "mead": 18, "powel": 18, "cg": 18, "bfg": 18, "l": 18, "tnc": 18, "cobyla": 18, "slsqp": 18, "reject": 18, "thesi": 18, "generate_normalized_param_dict": 18, "denorm": 18, "normalize_parameter_spac": 18, "param_spac": 18, "prune_parameter_spac": 18, "normalize_dict": 18, "prune": 18, "hyperparamet": 18, "popul": 18, "best1bin": 18, "best1exp": 18, "rand1exp": 18, "randtobest1exp": 18, "best2exp": 18, "rand2exp": 18, "randtobest1bin": 18, "best2bin": 18, "rand2bin": 18, "rand1bin": 18, "popsiz": 18, "maxit": 18, "constr": 18, "compute_intens": 18, "fun": 18, "intens": 18, "distance_to": 18, "euclidian": 18, "move_toward": 18, "alpha": 18, "toward": 18, "b0": 18, "attract": 18, "gamma": 18, "light": 18, "absorpt": 18, "coeffici": 18, "disruptive_uniform_crossov": 18, "dna1": 18, "dna2": 18, "disrupt": 18, "uniform": 18, "crossov": 18, "uniformli": 18, "gene": 18, "children": 18, "guarante": 18, "parent": 18, "mutat": 18, "dna": 18, "mutation_ch": 18, "single_point_crossov": 18, "single_point": 18, "two_point": 18, "disruptive_uniform": 18, "two_point_crossov": 18, "uniform_crossov": 18, "weighted_choic": 18, "probabl": [18, 22], "il": 18, "neighbor": 18, "ham": 18, "adjac": 18, "greedy": 18, "soon": 18, "no_improv": 18, "exce": 18, "50": 18, "random_walk": 18, "hillclimb": 18, "travers": 18, "inertia": 18, "c1": 18, "cognit": 18, "c2": 18, "social": 18, "fraction": 18, "acceptance_prob": 18, "old_cost": 18, "new_cost": 18, "modif": [18, 20], "po": 18, "t_min": 18, "001": 18, "995": 18, "vector_add_kernel": 19, "wise": 19, "1000000": [19, 21], "recogn": 19, "alright": 19, "portabl": 20, "stick": 20, "pointer": 20, "primit": 20, "lead": 20, "ineffici": 20, "situat": 20, "scientif": 20, "sens": 20, "experiment": 20, "pack": 20, "consult": 20, "create_receive_spec_struct": 20, "0l": 20, "pad": 20, "8byte": 20, "packstr": 20, "iiiiiiiiiiippi": 20, "fffi": 20, "nsampl": 20, "nsamplesiq": 20, "nslowtimesampl": 20, "nchannel": 20, "ntx": 20, "nrepeat": 20, "nfasttimesampl": 20, "rfsize": 20, "mnrow": 20, "mnrowsiq": 20, "nactivechannel": 20, "isiq": 20, "fsiq": 20, "fc": 20, "nbuffer": 20, "frombuff": 20, "len": 20, "receive_spec": 20, "bf": 20, "rf": 20, "recon": 20, "length": 20, "slight": 20, "matlab": 21, "typenam": 21, "my_typ": 21, "regardless": 21, "demot": 21, "rewrit": 21, "real": 21, "risk": 21, "seper": 21, "grid_div_z": 22, "06": 22, "log": 22, "auxilliari": 22, "safer": 22, "notat": 22, "divison": 22, "treat": 22, "warp": 22, "empti": 22, "kepler": 22, "plu": 22, "filter_mod": 22, "address_mod": 22, "clamp": 22, "mirror": 22, "axi": 22, "normalized_coordin": 22, "emtpi": 22, "get_local_s": 22, "satisfi": 22, "000001": 22, "ref": 22, "basinhop": 22, "bayes_opt": 22, "diff_evo": 22, "firefly_algorithm": 22, "genetic_algorithm": 22, "greedy_il": 22, "greedy_ml": 22, "ml": 22, "ordered_greedy_ml": 22, "pso": 22, "simulated_ann": 22, "sort": 22, "resourc": 22, "persist": 22, "consol": 22, "info": 22, "summar": 22, "store_result": 22, "results_filenam": 22, "typicali": 22, "percentag": 22, "create_device_target": 22, "header_filenam": 22, "target": 22, "dtarget_gpu": 22, "name_of_gpu": 22, "chosen": 22, "block_size_": 23, "grid_size_": 23, "compiler_opt_": 23, "loop_unroll_factor_": 23, "nvml_": 23, "nvmlobserv": 23}, "objects": {"kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "set_nvml_parameters"]], "kernel_tuner": [[22, 2, 1, "", "create_device_targets"], [22, 2, 1, "", "run_kernel"], [22, 2, 1, "", "store_results"], [22, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.observers": [[17, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[17, 1, 1, "", "after_finish"], [17, 1, 1, "", "after_start"], [17, 1, 1, "", "before_start"], [17, 1, 1, "", "during"], [17, 1, 1, "", "get_results"], [17, 1, 1, "", "register_configuration"], [17, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[17, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[17, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[17, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[18, 3, 0, "-", "basinhopping"], [18, 3, 0, "-", "bayes_opt"], [18, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [18, 3, 0, "-", "diff_evo"], [18, 3, 0, "-", "dual_annealing"], [18, 3, 0, "-", "firefly_algorithm"], [18, 3, 0, "-", "genetic_algorithm"], [18, 3, 0, "-", "greedy_ils"], [18, 3, 0, "-", "greedy_mls"], [18, 3, 0, "-", "minimize"], [18, 3, 0, "-", "mls"], [18, 3, 0, "-", "ordered_greedy_mls"], [18, 3, 0, "-", "pso"], [18, 3, 0, "-", "random_sample"], [18, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[18, 2, 1, "", "generate_normalized_param_dicts"], [18, 2, 1, "", "normalize_parameter_space"], [18, 2, 1, "", "prune_parameter_space"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[18, 0, 1, "", "Firefly"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[18, 1, 1, "", "compute_intensity"], [18, 1, 1, "", "distance_to"], [18, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[18, 2, 1, "", "disruptive_uniform_crossover"], [18, 2, 1, "", "mutate"], [18, 2, 1, "", "single_point_crossover"], [18, 2, 1, "", "tune"], [18, 2, 1, "", "two_point_crossover"], [18, 2, 1, "", "uniform_crossover"], [18, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[18, 2, 1, "", "acceptance_prob"], [18, 2, 1, "", "neighbor"], [18, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "correct_open_cache"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", "delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "to_valid_nvrtc_gpu_arch_cc"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"backend": [0, 6, 14, 21], "cuda": [0, 14, 15], "featur": [0, 2], "support": 0, "usag": [0, 13], "compil": [0, 6], "cach": 1, "file": 1, "The": [2, 13], "kernel": [2, 7, 8, 9, 10, 11, 13, 15, 21], "tuner": [2, 7, 8, 9, 10, 11, 13], "document": [2, 3, 6, 13, 22], "guid": [2, 3, 14], "refer": 2, "contribut": 3, "report": 3, "issu": 3, "code": [3, 7, 8, 9, 10, 12], "develop": 3, "environ": 3, "local": [3, 8], "setup": 3, "cluster": 3, "run": [3, 9], "test": [3, 4], "build": 3, "convolut": [4, 10], "2d": 4, "exampl": [4, 10, 13, 21], "implement": [4, 7, 8, 9], "tune": [4, 7, 8, 9, 11, 12, 15, 16, 17], "more": 4, "tunabl": 4, "paramet": [4, 9, 11, 17, 23], "correct": 5, "verif": 5, "design": 6, "strategi": [6, 18], "kernel_tun": [6, 18], "common": 6, "runner": 6, "sequenti": 6, "sequentialrunn": 6, "simulationrunn": 6, "devic": 6, "interfac": 6, "core": 6, "deviceinterfac": 6, "pycuda": [6, 14], "pycudafunct": 6, "cupi": 6, "cupyfunct": 6, "nvcuda": 6, "cudafunct": 6, "opencl": [6, 14], "openclfunct": 6, "compilerfunct": 6, "hip": [6, 14], "hipfunct": 6, "util": 6, "function": 6, "diffus": [7, 8, 9], "python": [7, 8, 9, 14], "comput": [7, 8, 9], "gpu": [7, 8, 9, 11], "auto": [7, 8, 9], "us": [7, 8, 9, 11, 15, 20], "share": [7, 8, 9, 15], "memori": [7, 8, 9, 15], "tile": [7, 8, 9], "store": [7, 8], "result": [7, 8], "tutori": [8, 9], "from": [8, 9], "physic": [8, 9], "best": 9, "product": 9, "c": 9, "vector": 10, "add": 10, "stencil": 10, "matrix": [10, 15], "multipl": [10, 15], "py": 10, "sepconv": 10, "convolution_correct": 10, "convolution_stream": 10, "reduct": 10, "spars": 10, "point": 10, "polygon": 10, "expdist": 10, "gener": 10, "3d": 11, "grid": 11, "let": 11, "": 11, "start": [11, 19], "cpu": 11, "move": 11, "optim": [11, 18], "host": 12, "number": 12, "stream": 12, "quick": 13, "instal": [13, 14], "citat": 13, "packag": 14, "other": 14, "pyopencl": 14, "pyhip": 14, "git": 14, "version": 14, "depend": 14, "naiv": 15, "increas": 15, "work": 15, "per": 15, "thread": 15, "metric": 16, "object": 16, "observ": 17, "powersensorobserv": 17, "nvmlobserv": 17, "execut": 17, "nvml": 17, "pmtobserv": 17, "basinhop": 18, "bayes_opt": 18, "brute_forc": 18, "diff_evo": 18, "dual_ann": 18, "firefly_algorithm": 18, "genetic_algorithm": 18, "greedy_il": 18, "greedy_ml": 18, "minim": 18, "ml": 18, "ordered_greedy_ml": 18, "pso": 18, "random_sampl": 18, "simulated_ann": 18, "get": 19, "struct": 20, "templat": 21, "select": 21, "api": 22, "vocabulari": 23}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Backends": [[0, "backends"]], "CUDA Backends": [[0, "cuda-backends"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Cache files": [[1, "cache-files"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [13, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[2, null]], "Guides": [[2, null]], "Features": [[2, null]], "Reference": [[2, null]], "Contribution guide": [[3, "contribution-guide"]], "Reporting Issues": [[3, "reporting-issues"]], "Contributing Code": [[3, "contributing-code"]], "Development environment": [[3, "development-environment"]], "Local setup": [[3, "local-setup"]], "Cluster setup": [[3, "cluster-setup"]], "Running tests": [[3, "running-tests"]], "Building documentation": [[3, "building-documentation"]], "Convolution": [[4, "Convolution"], [10, "convolution"]], "2D Convolution example": [[4, "2D-Convolution-example"]], "Implement a test": [[4, "Implement-a-test"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "Correctness Verification": [[5, "correctness-verification"]], "Design documentation": [[6, "design-documentation"]], "Strategies": [[6, "strategies"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "Runners": [[6, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[6, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[6, "util-functions"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "Diffusion": [[7, "Diffusion"], [7, "id1"], [8, "Diffusion"], [9, "Diffusion"]], "Python implementation": [[7, "Python-implementation"], [8, "Python-implementation"], [9, "Python-implementation"]], "Computing on the GPU": [[7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[7, "Using-Shared-Memory"]], "Tiling GPU Code": [[7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"]], "Storing the results": [[7, "Storing-the-results"], [8, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[8, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [9, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[8, "Using-Shared-(local)-Memory"]], "Using shared memory": [[9, "Using-shared-memory"], [15, "Using-shared-memory"]], "Using the best parameters in a production run": [[9, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[9, "Python-run"]], "C run": [[9, "C-run"]], "Kernel Tuner Examples": [[10, "kernel-tuner-examples"]], "Vector Add": [[10, "vector-add"]], "Stencil": [[10, "stencil"]], "Matrix Multiplication": [[10, "matrix-multiplication"]], "convolution.py": [[10, "convolution-py"]], "sepconv.py": [[10, "sepconv-py"]], "convolution_correct.py": [[10, "convolution-correct-py"]], "convolution_streams.py": [[10, "convolution-streams-py"]], "Reduction": [[10, "reduction"]], "Sparse Matrix Vector Multiplication": [[10, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[10, "point-in-polygon"]], "ExpDist": [[10, "expdist"]], "Code Generator": [[10, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[11, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[11, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[11, "Let's-move-to-the-GPU"]], "Tune the kernel": [[11, "Tune-the-kernel"]], "Using the optimized parameters": [[11, "Using-the-optimized-parameters"]], "Tuning Host Code": [[12, "tuning-host-code"]], "Tuning the number of streams": [[12, "tuning-the-number-of-streams"]], "Quick install": [[13, "quick-install"]], "Example usage": [[13, "example-usage"]], "Citation": [[13, "citation"]], "Installation": [[14, "installation"]], "Python": [[14, "python"]], "Installing Python Packages": [[14, "installing-python-packages"]], "CUDA and PyCUDA": [[14, "cuda-and-pycuda"]], "Other CUDA Backends": [[14, "other-cuda-backends"]], "OpenCL and PyOpenCL": [[14, "opencl-and-pyopencl"]], "HIP and PyHIP": [[14, "hip-and-pyhip"]], "Installing the git version": [[14, "installing-the-git-version"]], "Dependencies for the guides": [[14, "dependencies-for-the-guides"]], "Matrix multiplication": [[15, "Matrix-multiplication"]], "Naive CUDA kernel": [[15, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[15, "Tuning-a-naive-kernel"]], "Increase work per thread": [[15, "Increase-work-per-thread"]], "Metrics and Objectives": [[16, "metrics-and-objectives"]], "Metrics": [[16, "metrics"]], "Tuning Objectives": [[16, "tuning-objectives"]], "Observers": [[17, "observers"]], "PowerSensorObserver": [[17, "powersensorobserver"]], "NVMLObserver": [[17, "nvmlobserver"]], "Tuning execution parameters with NVML": [[17, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[17, "pmtobserver"]], "Optimization strategies": [[18, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[19, "getting-started"]], "Using structs": [[20, "using-structs"]], "Templated kernels": [[21, "templated-kernels"]], "Example": [[21, "example"]], "Selecting a backend": [[21, "selecting-a-backend"]], "API Documentation": [[22, "api-documentation"]], "Parameter Vocabulary": [[23, "parameter-vocabulary"]]}, "indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig"]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "correct_open_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.correct_open_cache"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[6, "module-kernel_tuner.strategies.common"], [6, "module-kernel_tuner.util"], [18, "module-kernel_tuner.strategies.basinhopping"], [18, "module-kernel_tuner.strategies.bayes_opt"], [18, "module-kernel_tuner.strategies.brute_force"], [18, "module-kernel_tuner.strategies.diff_evo"], [18, "module-kernel_tuner.strategies.dual_annealing"], [18, "module-kernel_tuner.strategies.firefly_algorithm"], [18, "module-kernel_tuner.strategies.genetic_algorithm"], [18, "module-kernel_tuner.strategies.greedy_ils"], [18, "module-kernel_tuner.strategies.greedy_mls"], [18, "module-kernel_tuner.strategies.minimize"], [18, "module-kernel_tuner.strategies.mls"], [18, "module-kernel_tuner.strategies.ordered_greedy_mls"], [18, "module-kernel_tuner.strategies.pso"], [18, "module-kernel_tuner.strategies.random_sample"], [18, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params"]], "set_nvml_parameters() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.set_nvml_parameters"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "to_valid_nvrtc_gpu_arch_cc() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[17, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[17, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[17, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[17, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[18, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[18, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[18, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[18, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[18, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[18, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[18, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[18, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[18, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[18, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[18, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[22, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[22, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.tune_kernel"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["backends", "cache_files", "contents", "contributing", "convolution", "correctness", "design", "diffusion", "diffusion_opencl", "diffusion_use_optparam", "examples", "grid3d", "hostcode", "index", "install", "matrix_multiplication", "metrics", "observers", "optimization", "quickstart", "structs", "templates", "user-api", "vocabulary"], "filenames": ["backends.rst", "cache_files.rst", "contents.rst", "contributing.rst", "convolution.ipynb", "correctness.rst", "design.rst", "diffusion.ipynb", "diffusion_opencl.ipynb", "diffusion_use_optparam.ipynb", "examples.rst", "grid3d.ipynb", "hostcode.rst", "index.rst", "install.rst", "matrix_multiplication.ipynb", "metrics.rst", "observers.rst", "optimization.rst", "quickstart.rst", "structs.rst", "templates.rst", "user-api.rst", "vocabulary.rst"], "titles": ["Backends", "Cache files", "The Kernel Tuner documentation", "Contribution guide", "Convolution", "Correctness Verification", "Design documentation", "Diffusion", "Tutorial: From physics to tuned GPU kernels", "Tutorial: From physics to tuned GPU kernels", "Kernel Tuner Examples", "3D Grid on GPU with Kernel Tuner", "Tuning Host Code", "The Kernel Tuner documentation", "Installation", "Matrix multiplication", "Metrics and Objectives", "Observers", "Optimization strategies", "Getting Started", "Using structs", "Templated kernels", "API Documentation", "Parameter Vocabulary"], "terms": {"kernel": [0, 1, 3, 4, 5, 6, 12, 14, 16, 17, 18, 19, 20, 22, 23], "tuner": [0, 1, 3, 4, 5, 6, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "implement": [0, 5, 6, 10, 11, 16, 17, 18, 22], "multipl": [0, 2, 6, 12, 17, 21, 22], "one": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "opencl": [0, 3, 4, 7, 8, 9, 10, 12, 13, 15, 22], "hip": [0, 3, 13, 22], "gener": [0, 3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 20, 22, 23], "select": [0, 3, 4, 6, 7, 8, 9, 11, 14, 15, 17, 18, 22], "i": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "most": [0, 3, 6, 7, 8, 9, 10, 12, 13, 15, 17, 18, 19, 20, 22], "case": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 19, 20, 22], "automat": [0, 3, 4, 7, 8, 9, 11, 12, 15, 21, 22], "done": [0, 4, 14, 16, 17], "base": [0, 3, 6, 16, 17, 21, 22], "": [0, 3, 4, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22], "program": [0, 3, 5, 7, 8, 9, 12, 15, 20, 21], "languag": [0, 6, 9, 12, 15, 20, 22], "sometim": [0, 3, 7, 8, 9, 20], "you": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22, 23], "ll": [0, 4, 7, 8, 9, 14, 15], "want": [0, 5, 9, 11, 12, 14, 15, 17, 19, 22, 23], "specif": [0, 4, 6, 7, 8, 9, 10, 11, 16, 17, 18, 22], "choos": [0, 7, 8, 9, 15, 18, 22], "pycuda": [0, 3, 7, 9, 11, 12, 17, 21], "default": [0, 3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 21, 22], "It": [0, 3, 4, 6, 7, 8, 9, 12, 14, 15, 17, 21, 22], "compar": [0, 4, 5, 7, 8, 9, 11, 15, 16, 17], "complet": [0, 1, 4], "cupi": [0, 3, 12, 14, 17, 21, 22], "becaus": [0, 4, 5, 7, 8, 9, 12, 14, 15, 16, 21, 23], "ident": 0, "includ": [0, 3, 4, 5, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "here": [0, 4, 10, 11, 12, 14, 15, 17, 22], "well": [0, 7, 8, 9, 11, 15, 17, 22], "To": [0, 3, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 20, 21, 22], "us": [0, 1, 2, 3, 4, 5, 6, 10, 12, 13, 14, 16, 17, 18, 19, 21, 22, 23], "nvidia": [0, 3, 6, 14, 15, 17, 21], "gpu": [0, 3, 4, 5, 6, 10, 12, 13, 15, 17, 19, 20, 22, 23], "see": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 19, 21, 22], "http": [0, 3, 6, 13, 14, 17], "github": [0, 3, 4, 7, 8, 9, 11, 14, 15], "com": [0, 3, 6, 13, 14], "jatinx": [0, 14], "nv": 0, "while": [0, 1, 4, 6, 7, 8, 9, 10, 15, 17, 18], "expect": [0, 3, 4, 5, 6, 7, 8, 9, 15, 17, 22], "all": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 22], "input": [0, 4, 5, 7, 8, 9, 10, 12, 15, 16, 19, 20, 22], "output": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 19, 22, 23], "numpi": [0, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 19, 20, 21, 22], "arrai": [0, 4, 5, 6, 7, 8, 9, 11, 12, 19, 20, 22], "also": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "argument": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "thi": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "give": [0, 7, 8, 9, 18], "user": [0, 3, 4, 5, 6, 8, 10, 14, 15, 16, 17, 18, 21, 22], "more": [0, 3, 5, 6, 7, 8, 9, 13, 14, 15, 16, 17, 19, 21, 22], "control": [0, 7, 8, 9, 17, 18, 22], "over": [0, 6, 7, 8, 9, 14, 15, 17, 18], "how": [0, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 19, 20, 21, 22], "memori": [0, 4, 6, 10, 12, 17, 20, 22, 23], "handl": [0, 12, 22], "check": [0, 3, 5, 6, 7, 8, 9, 12, 15], "dure": [0, 1, 6, 7, 8, 9, 11, 17, 22], "verif": [0, 2, 10, 22], "can": [0, 1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "happen": [0, 1, 3, 4, 15, 19], "entir": [0, 3, 6, 7, 8, 9, 15, 18, 22], "when": [0, 1, 3, 4, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23], "onli": [0, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 18, 20, 22], "textur": [0, 6, 22], "c": [0, 3, 4, 6, 10, 12, 13, 14, 15, 19, 21, 22], "signatur": [0, 4, 6], "With": [0, 11, 12], "other": [0, 1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 22, 23], "requir": [0, 3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21], "ha": [0, 3, 4, 6, 7, 8, 9, 12, 15, 17, 18, 22], "extern": [0, 17, 21], "linkag": [0, 21], "If": [0, 3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 20, 22], "code": [0, 2, 4, 6, 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "wrap": [0, 6, 19, 21, 22], "an": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22], "block": [0, 4, 6, 7, 8, 9, 10, 11, 14, 15, 16, 19, 22, 23], "which": [0, 3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "mai": [0, 3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "caus": [0, 7, 8, 9], "issu": [0, 20], "contain": [0, 1, 4, 6, 7, 8, 9, 11, 12, 15, 17, 18, 21, 22], "cannot": [0, 3, 7, 8, 9, 17], "have": [0, 1, 3, 4, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "present": [0, 3, 15], "header": [0, 22], "file": [0, 2, 3, 4, 6, 7, 8, 10, 12, 15, 18, 19, 21, 22], "As": [0, 1, 4, 7, 8, 9, 11, 14, 15, 17], "detail": [0, 6, 14, 22], "further": [0, 7, 8, 9, 14, 15], "templat": [0, 2, 11], "ar": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23], "fulli": [0, 3, 14], "limit": [0, 3, 4, 6, 7, 8, 9, 10, 15, 17, 18, 21, 22, 23], "python": [0, 3, 4, 6, 10, 11, 12, 15, 17, 19, 20, 21, 22], "benchmark": [0, 1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 22, 23], "observ": [0, 2, 6, 16, 22, 23], "constant": [0, 4, 6, 7, 8, 9, 10, 12, 15, 18, 22], "dynam": [0, 6, 22], "share": [0, 4, 6, 22], "anoth": [0, 7, 8, 9, 12, 15, 16, 18, 22], "import": [0, 4, 5, 7, 8, 9, 11, 14, 15, 16, 19, 20, 21], "differ": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 22], "between": [0, 7, 8, 9, 12, 14, 15, 16, 18, 22], "The": [0, 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22], "tabl": 0, "below": [0, 3, 9, 10, 11, 12, 14, 15, 16, 17, 18, 20], "list": [0, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 20, 22], "packag": [0, 3], "pyhip": [0, 6], "interfac": [0, 4, 5, 12, 14, 17, 18, 20, 22], "lang": [0, 6, 10, 12, 21, 22], "nvcuda": 0, "nvcc": [0, 6], "nvrtc": [0, 6, 21], "hiprtc": 0, "A": [1, 3, 4, 6, 13, 14, 15, 17, 18, 22], "veri": [1, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21], "featur": [1, 4, 5, 10, 14, 16, 17, 19, 21, 22], "abil": 1, "store": [1, 3, 4, 6, 9, 15, 17, 19, 22], "result": [1, 3, 4, 5, 6, 9, 11, 15, 16, 17, 18, 19, 22, 23], "tune": [1, 2, 5, 6, 10, 13, 14, 18, 19, 21, 22, 23], "enabl": [1, 17, 18, 20, 21], "pass": [1, 3, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16, 17, 18, 19, 21, 22], "ani": [1, 3, 4, 6, 7, 8, 9, 12, 15, 16, 17, 18, 20, 21, 22, 23], "filenam": [1, 4, 6, 10, 15, 19, 22], "option": [1, 3, 4, 5, 6, 7, 8, 9, 10, 12, 14, 15, 16, 17, 18, 21, 22, 23], "tune_kernel": [1, 4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 16, 18, 19, 20, 21, 22], "individu": [1, 17, 18], "configur": [1, 4, 6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 22], "append": [1, 6, 14, 22], "run": [1, 4, 5, 6, 7, 8, 11, 12, 14, 15, 17, 18, 22], "allow": [1, 3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 18, 21, 22], "restart": [1, 3, 7, 8, 9, 18], "session": [1, 3, 6, 18], "from": [1, 3, 4, 5, 6, 7, 10, 11, 12, 14, 15, 17, 18, 20, 21, 22], "exist": [1, 6, 22], "should": [1, 3, 4, 5, 6, 7, 8, 9, 12, 15, 16, 17, 19, 22], "someth": [1, 4, 7, 8, 9, 15], "termin": [1, 14], "previou": [1, 3, 7, 8, 9, 18, 22], "befor": [1, 3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 22], "had": [1, 4], "quit": [1, 7, 8, 9, 11, 15, 21], "often": [1, 7, 8, 9, 17], "hpc": 1, "environ": [1, 4, 6, 14, 18, 22], "job": 1, "reserv": [1, 8, 23], "out": [1, 3, 4, 5, 11, 14, 15], "number": [1, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 16, 17, 18, 19, 20, 22, 23], "simul": [1, 6, 9, 13, 18, 20, 22], "visual": [1, 3, 15], "optim": [1, 2, 4, 5, 6, 7, 8, 9, 12, 13, 15, 16, 17, 22], "strategi": [1, 2, 4, 13, 16, 22], "start": [1, 2, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 22], "call": [1, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 21, 22], "full": [1, 3, 6, 17, 19], "search": [1, 4, 6, 10, 13, 15, 16, 18, 22], "space": [1, 3, 4, 5, 6, 11, 12, 15, 16, 18, 22], "true": [1, 4, 5, 6, 7, 8, 9, 12, 15, 17, 18, 22], "creat": [1, 3, 4, 6, 7, 8, 9, 10, 11, 15, 17, 19, 20, 22], "even": [1, 3, 7, 8, 9, 12, 15, 18], "work": [1, 3, 4, 6, 7, 8, 9, 14, 16, 18, 21, 22], "still": [1, 3, 5, 15], "new": [1, 3, 6, 7, 8, 9, 18, 22], "come": [1, 6, 7, 8, 9, 15, 17, 21], "thei": [1, 3, 6, 7, 8, 9, 10, 15, 16], "stream": [1, 6, 7, 8, 9], "pleas": [1, 3, 4, 10, 13, 14, 17, 19, 20, 22], "dashboard": [1, 13], "introduct": 2, "instal": [2, 3, 4, 7, 8, 9, 11, 12, 15, 17, 19], "get": [2, 4, 6, 7, 8, 9, 11, 14, 15], "convolut": [2, 5, 12, 15], "diffus": 2, "matrix": 2, "exampl": [2, 3, 5, 6, 7, 8, 9, 12, 14, 15, 16, 17, 18, 19, 20, 22], "backend": [2, 3, 12, 17], "cach": [2, 3, 6, 7, 8, 9, 14, 15, 18, 22], "correct": [2, 3, 12, 20, 22], "host": [2, 3, 6, 8, 9, 10, 17, 20, 21, 22], "struct": 2, "metric": [2, 4, 6, 10, 15, 22], "object": [2, 4, 5, 6, 7, 8, 9, 18, 22], "api": [2, 4, 6], "paramet": [2, 5, 6, 7, 8, 10, 12, 15, 16, 18, 19, 20, 21, 22], "vocabulari": [2, 17, 19], "design": [2, 3, 7, 8, 9, 17], "contribut": 2, "thank": 3, "consid": [3, 11, 13, 15, 22], "Not": [3, 6], "help": [3, 21], "u": [3, 4, 7, 8, 9], "improv": [3, 6, 7, 8, 9, 15, 18, 22], "about": [3, 4, 6, 7, 8, 9, 13, 15, 17, 18, 19, 22], "problem": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 22], "ensur": [3, 5, 7, 8, 9, 12, 14, 17, 20], "follow": [3, 4, 5, 6, 7, 8, 9, 11, 12, 13, 14, 15, 17, 18, 21, 22], "describ": [3, 4, 6, 12, 17, 20], "what": [3, 4, 5, 6, 7, 8, 9, 12, 15, 17, 19, 20, 21, 22, 23], "possibl": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "minim": [3, 16, 21, 22], "reproduc": 3, "actual": [3, 4, 5, 6, 7, 8, 9, 11, 15, 21], "error": [3, 4, 5, 6, 12, 15, 21], "print": [3, 4, 6, 7, 8, 9, 11, 15, 22], "version": [3, 4, 15, 17, 22], "cuda": [3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 17, 19, 20, 21, 22], "compil": [3, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 21, 22, 23], "applic": [3, 4, 7, 8, 9, 10, 11, 12, 13, 16, 17, 20, 21, 22], "For": [3, 4, 5, 6, 7, 8, 9, 11, 13, 14, 17, 19, 20, 22], "propos": 3, "chang": [3, 11, 17, 22], "addit": [3, 4, 7, 8, 9, 14, 16, 19], "signific": 3, "first": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 16, 18, 20, 21, 22], "discuss": [3, 6], "Then": [3, 7, 8, 9, 11, 13, 14, 21], "fork": 3, "repositori": [3, 4, 7, 8, 9, 11, 13, 14, 15], "branch": 3, "per": [3, 4, 6, 7, 8, 9, 11, 16, 17, 22], "pull": 3, "request": [3, 17, 22], "googl": 3, "style": 3, "sphinxdoc": 3, "docstr": [3, 6], "modul": [3, 6, 12, 17], "public": [3, 13], "function": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 20, 21, 22], "up": [3, 4, 6, 7, 8, 9, 14, 15, 19, 22], "date": 3, "written": [3, 21], "unit": [3, 6], "your": [3, 4, 7, 8, 9, 11, 12, 13, 14, 17, 20, 22], "nox": 3, "do": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 22], "hardwar": [3, 7, 8, 9, 11, 17, 18, 19], "skip": [3, 4, 7, 8, 9, 22], "produc": [3, 5], "same": [3, 4, 5, 6, 7, 8, 9, 11, 12, 17, 19, 22], "better": [3, 7, 8, 9], "entri": [3, 6, 7, 8], "changelog": 3, "md": 3, "doubt": 3, "where": [3, 4, 5, 6, 7, 8, 9, 15, 16, 17, 20, 21, 22], "put": [3, 6, 7, 8, 9], "look": [3, 4, 6, 7, 8, 9, 11, 14, 15, 21], "regard": [3, 6, 18], "step": [3, 7, 8, 9, 14, 15, 16, 18, 21], "set": [3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 17, 18, 19, 21, 22, 23], "sudo": [3, 14], "access": [3, 4, 7, 8, 9, 11, 17, 20], "e": [3, 14, 16, 17, 18, 22], "g": [3, 14, 16, 17], "devic": [3, 4, 5, 7, 8, 9, 10, 12, 17, 21, 22], "clone": [3, 4, 7, 8, 9, 11, 14, 15], "git": [3, 17], "desir": 3, "locat": [3, 5, 11, 17], "kerneltun": [3, 13], "kernel_tun": [3, 4, 5, 7, 8, 9, 11, 12, 13, 14, 15, 17, 19, 20, 21, 22, 23], "cd": [3, 14], "prepar": [3, 6, 7, 8, 9], "system": [3, 13, 14, 17], "On": [3, 7, 8, 9, 22], "ubuntu": 3, "apt": 3, "updat": [3, 6], "upgrad": 3, "y": [3, 4, 6, 7, 8, 9, 11, 12, 15, 22], "make": [3, 4, 7, 8, 9, 11, 13, 14, 15, 17, 20, 21], "essenti": [3, 4], "libssl": 3, "dev": [3, 14, 17], "zlib1g": 3, "libbz2": 3, "libreadlin": 3, "libsqlite3": 3, "wget": [3, 14], "curl": [3, 14], "llvm": 3, "libncurses5": 3, "libncursesw5": 3, "xz": 3, "util": [3, 15], "tk": 3, "libffi": 3, "liblzma": 3, "openssl": 3, "pyenv": 3, "linux": [3, 14], "bash": [3, 14], "rememb": [3, 4, 7, 8, 9, 15], "add": [3, 4, 6, 7, 8, 9, 12, 15, 17, 18], "bash_profil": 3, "bashrc": 3, "specifi": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 19, 20, 21, 22, 23], "maco": 3, "brew": 3, "after": [3, 4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 22], "shell": 3, "some": [3, 4, 6, 7, 8, 9, 14, 15, 16, 17, 18, 19, 20, 21, 22], "need": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 21, 22], "libgdbm": 3, "libnss3": 3, "lzma": 3, "3": [3, 5, 7, 8, 9, 11, 12, 14, 15, 18, 22], "8": [3, 4, 6, 7, 8, 9, 11, 14, 15, 17], "9": [3, 4, 5, 7, 8, 9, 12], "10": [3, 7, 8, 9, 13, 18], "11": [3, 7, 8, 9], "reason": [3, 4, 6, 20, 22], "we": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 20, 21], "re": [3, 4, 7, 8, 9, 11, 15], "oppos": 3, "just": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15], "so": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 18, 19, 21, 22], "against": [3, 5, 6], "support": [3, 4, 6, 7, 8, 9, 12, 14, 17, 18, 21, 22, 23], "found": [3, 4, 6, 13, 17, 18], "replac": [3, 4, 5, 6, 7, 8, 9, 11, 15, 22], "global": [3, 6, 7, 8, 9, 18], "virtualenv": 3, "virtual": [3, 14], "folder": 3, "whatev": [3, 6, 12, 18], "name": [3, 4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 19, 22, 23], "prefer": [3, 4, 6, 7, 9, 17, 22], "poetri": [3, 14], "ssl": [3, 14], "org": [3, 13, 14], "python3": [3, 14], "sure": [3, 4, 7, 8, 9, 13, 14, 15], "path": [3, 4, 17], "instruct": [3, 7, 8, 9, 10, 14, 15], "end": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 20], "export": 3, "plugin": 3, "self": [3, 6, 17, 18], "non": [3, 5], "depend": [3, 4, 5, 9, 10, 11, 13, 16, 22], "appli": [3, 7, 8, 9], "open": [3, 5, 7, 8, 12, 15], "take": [3, 4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 21, 22], "effect": [3, 4, 7, 8, 9, 22], "activ": 3, "pip": [3, 4, 7, 8, 13, 14, 15], "point": [3, 4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 22], "project": [3, 14], "extra": [3, 14, 21], "doc": [3, 4, 6, 7, 8, 9, 11, 14, 15], "leav": 3, "doe": [3, 5, 6, 7, 8, 9, 11, 12, 15, 17, 21, 22], "go": [3, 4, 7, 8, 9, 11, 13, 14, 15, 19], "necessari": [3, 5, 6, 7, 8, 9, 22], "conveni": [3, 7, 8, 9, 12, 22], "cuda11x": 3, "cuda12x": 3, "These": [3, 7, 8, 9, 11, 14, 15, 17, 21, 22], "current": [3, 4, 5, 6, 7, 8, 9, 14, 15, 17, 18, 22], "defin": [3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 16, 17, 21, 22], "part": [3, 7, 8, 9, 13, 14, 15, 16, 20, 22], "forget": [3, 11], "correctli": [3, 15], "ld_libary_path": 3, "cpath": 3, "pytest": 3, "except": [3, 6, 10], "been": [3, 4, 6, 7, 8, 9, 12, 15, 18], "left": [3, 6, 7, 8, 9, 11, 16], "gracefulli": 3, "note": [3, 4, 6, 7, 8, 9, 11, 13, 14, 15, 17, 20, 22], "driver": [3, 6, 7, 9, 11], "privileg": [3, 17], "read": [3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 22], "counter": [3, 17], "energi": [3, 13, 17, 18, 23], "measur": [3, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 22, 23], "cat": 3, "proc": 3, "param": [3, 4, 5, 6, 17, 18, 22], "grep": 3, "rmprofilingadminonli": 3, "1": [3, 4, 5, 7, 8, 9, 11, 12, 15, 17, 18, 22], "without": [3, 7, 8, 9, 11, 12, 17, 18], "conda": 3, "mamba": 3, "perform": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20, 22], "miniconda": [3, 14], "tradit": 3, "under": [3, 4, 13, 22], "quota": 3, "otherwis": [3, 6, 15, 22], "restrict": [3, 6, 10, 15, 21, 22], "disk": 3, "directori": [3, 4, 7, 8, 9, 11, 14, 15], "save": [3, 7, 8], "ad": [3, 7, 8, 9, 12, 22], "condarc": 3, "envs_dir": 3, "both": [3, 7, 8, 9, 10, 15], "via": [3, 18], "usual": [3, 17], "provid": [3, 5, 6, 7, 8, 9, 12, 21, 22], "exit": 3, "enter": [3, 4, 7, 8, 9, 11, 15], "avail": [3, 4, 7, 8, 9, 10, 11, 14, 17], "elsewher": 3, "variabl": [3, 6, 11, 14, 18, 22], "pip_cache_dir": 3, "dir": [3, 14], "xdg_cache_hom": 3, "continu": [3, 4, 6, 7, 8, 9, 14, 17, 18, 22], "n": [3, 5, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21], "forg": 3, "execut": [3, 4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 22], "config": [3, 6], "auto_activate_bas": 3, "fals": [3, 6, 17, 18, 22], "load": [3, 6], "unload": [3, 6], "rocm": [3, 14, 17], "inform": [3, 4, 6, 7, 8, 9, 13, 17, 18, 19, 22, 23], "like": [3, 4, 6, 7, 8, 9, 10, 11, 15, 18, 19, 20, 21, 22], "keyr": 3, "seemingli": 3, "weird": 3, "known": [3, 15], "m": [3, 7, 8, 9, 11], "disabl": 3, "verifi": [3, 5, 6, 10, 22], "miss": [3, 6, 22], "sync": [3, 20], "dry": 3, "node": [3, 18], "In": [3, 4, 5, 6, 7, 8, 9, 11, 12, 14, 15, 16, 17, 19, 20, 22, 23], "noxset": 3, "toml": 3, "venvbackend": 3, "2": [3, 4, 5, 7, 8, 9, 10, 11, 12, 15, 17, 18, 22], "anaconda": 3, "venv": 3, "alreadi": [3, 4, 6, 7, 8, 9, 14, 15, 22], "Be": [3, 7, 8, 9], "adjust": [3, 4], "envdir": 3, "particularli": [3, 4, 16], "diskquota": 3, "isol": [3, 21], "top": [3, 6, 11, 17, 22], "level": [3, 6, 17], "coverag": 3, "gigabyt": 3, "size": [3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 18, 19, 21, 22], "tight": 3, "diskspac": 3, "small": [3, 4, 7, 8, 9, 15], "remov": [3, 18], "each": [3, 4, 5, 6, 7, 8, 11, 15, 17, 18, 22], "ran": 3, "longer": [3, 4, 6, 16], "would": [3, 4, 7, 8, 9, 21], "command": [3, 14], "line": [3, 4, 7, 8, 9], "combin": [3, 4, 6, 7, 8, 9, 10, 11, 15, 17, 18, 19, 22], "compat": [3, 6, 14], "involv": 3, "especi": 3, "don": [3, 6, 7, 9, 11, 12, 22], "t": [3, 4, 6, 7, 8, 9, 11, 12, 14, 18, 21, 22], "break": [3, 21], "them": [3, 4, 9, 11, 12, 15], "capabl": [3, 6, 7, 8, 13, 15, 22], "hold": [3, 7, 8, 15, 19, 20, 22], "pyopencl": [3, 6, 8, 17], "invok": 3, "tab": 3, "studio": 3, "id": [3, 6, 17], "seen": [3, 4, 6, 15], "integr": [3, 21], "type": [3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 17, 18, 19, 20, 21, 22], "html": [3, 6], "page": [3, 4, 7, 8, 9, 10, 11, 13, 15, 16], "sourc": [3, 4, 6, 7, 8, 9, 11, 12, 14, 15, 17, 21, 22], "inspect": [3, 6, 17], "commit": 3, "brows": 3, "through": [3, 6, 7, 8, 9, 11, 13, 16, 17, 18, 22], "least": [3, 6], "those": [3, 4, 10, 14, 17], "pandoc": 3, "mac": 3, "onlin": 3, "built": [3, 17, 18, 20, 22], "action": 3, "correspond": [3, 4, 7, 8, 9, 11, 17, 18, 19], "master": 3, "latest": [3, 14], "last": [3, 6, 20], "releas": [3, 6], "stabl": 3, "publish": [3, 13], "process": [3, 4, 6, 7, 8, 9, 15, 16, 17, 18, 21], "again": [3, 4, 7, 8, 9, 11, 15], "autom": 3, "guid": [4, 7, 15, 16, 19], "meant": 4, "write": [4, 10, 11, 15, 21, 22], "script": [4, 6, 15, 20, 21], "simpl": [4, 6, 7, 8, 9, 10, 11, 12, 13, 15, 17, 18, 19, 20], "find": [4, 12, 15, 18, 22], "shortli": 4, "much": [4, 7, 8, 9, 11, 17, 21, 22], "reus": [4, 7, 8, 9, 15], "document": [4, 5, 7, 8, 9, 11, 14, 15, 20, 23], "jupyt": [4, 7, 8, 9, 11, 14, 15], "notebook": [4, 7, 8, 9, 11, 14, 15], "tutori": [4, 7, 11, 13, 14, 15], "readi": [4, 6, 7, 8, 9, 11, 15], "oper": [4, 7, 8, 9, 11, 12, 15, 16], "signal": [4, 23], "imag": [4, 7, 8, 9], "main": [4, 6, 11, 17, 19], "neural": 4, "network": 4, "deep": 4, "learn": 4, "comput": [4, 5, 6, 10, 11, 12, 13, 15, 18, 22], "linear": [4, 15, 22], "weight": [4, 18], "filter": [4, 5, 10, 12], "rang": [4, 5, 7, 8, 9, 11, 12, 21], "pixel": 4, "w": [4, 7, 8, 16, 18], "time": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 18, 21, 22, 23], "h": [4, 11, 22], "f": [4, 5, 11, 12, 20], "f_w": 4, "f_h": 4, "o": [4, 6], "begin": [4, 7, 8, 9, 11], "equat": [4, 7, 8, 9, 11, 18], "nonumb": [4, 11], "x": [4, 5, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "sum": [4, 5, 6, 15], "limits_": 4, "j": [4, 7, 8, 9, 13, 15], "0": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 20, 22], "naiv": [4, 5, 7, 8, 9], "parallel": [4, 7, 8, 9], "thread": [4, 6, 7, 8, 9, 10, 11, 16, 17, 19, 22, 23], "avoid": [4, 6, 15, 23], "confus": 4, "around": [4, 10], "term": 4, "refer": [4, 5, 6, 7, 8, 9, 10, 12, 14, 17, 22], "shown": [4, 6, 17], "press": [4, 7, 8, 9, 11, 15], "shift": [4, 7, 8, 9, 11, 15], "writefil": [4, 15], "convolution_na": [4, 5], "cu": [4, 5, 12, 15, 19, 21], "__global__": [4, 7, 9, 11, 13, 15, 19, 21], "void": [4, 7, 8, 9, 11, 13, 15, 19, 20, 21], "convolution_kernel": [4, 5], "float": [4, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 18, 19, 20, 21, 22], "int": [4, 6, 7, 8, 9, 11, 13, 15, 19, 21, 22], "blockidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "blockdim": [4, 19, 22], "threadidx": [4, 7, 8, 9, 11, 13, 15, 19, 21], "image_height": 4, "image_width": 4, "filter_height": 4, "filter_width": 4, "input_width": 4, "run_kernel": [4, 5, 6, 10, 22], "our": [4, 7, 8, 9, 11, 15, 19, 20], "But": [4, 7, 8, 9, 11, 19], "data": [4, 6, 7, 8, 9, 11, 12, 15, 16, 17, 19, 20, 22], "np": [4, 6, 11, 15, 19, 20], "filter_s": 4, "17": [4, 5, 7, 8, 9, 12], "output_s": 4, "4096": [4, 5, 7, 8, 9, 12, 15], "prod": [4, 5, 12], "border_s": 4, "input_s": [4, 5, 12], "output_imag": 4, "zero": [4, 5, 11, 12, 15], "astyp": [4, 5, 7, 8, 9, 11, 12, 13, 15, 19, 21], "float32": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "input_imag": 4, "random": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 21, 22], "randn": [4, 5, 12, 13, 15, 19, 21], "conv_filt": 4, "now": [4, 6, 7, 8, 9, 11, 12, 15, 19], "structur": [4, 6, 7, 8, 15, 19], "kernel_nam": [4, 6, 12, 20, 21, 22], "kernel_sourc": [4, 6, 20, 22], "problem_s": [4, 5, 6, 7, 8, 9, 11, 12, 15, 19, 20, 22, 23], "ellipsi": 4, "indic": [4, 18, 23], "mani": [4, 6, 7, 8, 9, 15, 16, 17, 18, 22], "won": 4, "right": [4, 7, 8, 9, 11, 14], "interest": [4, 10, 20], "five": [4, 6, 19], "string": [4, 6, 7, 8, 9, 10, 15, 16, 17, 19, 20, 22], "domain": [4, 7, 8, 9, 10, 11, 22], "three": [4, 5, 15], "dimens": [4, 6, 7, 8, 9, 10, 11, 12, 15, 16, 18, 19, 22, 23], "dictionari": [4, 6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "simpli": [4, 5, 6, 7, 8, 9, 11, 18, 19, 22], "cell": [4, 7, 8, 9, 11, 15], "wrote": 4, "determin": [4, 7, 8, 9, 11, 17, 18], "grid": [4, 6, 7, 8, 9, 10, 12, 15, 22, 23], "abov": [4, 6, 7, 8, 9, 11, 14, 15, 19, 20], "divid": [4, 7, 8, 9, 11, 12, 15, 22], "divisor": [4, 6, 7, 8, 9, 15, 22], "scalar": [4, 7, 8, 9, 11, 22], "therefor": [4, 5, 7, 8, 9, 11, 12, 15], "exactli": [4, 6, 7, 8, 9, 15, 17], "order": [4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19, 22], "match": [4, 5, 6], "32": [4, 6, 7, 8, 9, 11, 13, 15, 19, 22], "bit": [4, 6, 7, 8, 9, 11, 12, 15], "final": [4, 5, 7, 8, 9, 11], "anyth": 4, "insert": [4, 5, 6, 9, 11, 12, 15, 19, 21, 22, 23], "preprocessor": [4, 6, 22], "statement": [4, 9, 11, 15, 21], "valu": [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 17, 18, 19, 22], "were": [4, 7, 8, 9, 11, 15, 22], "i_like_convolut": 4, "42": 4, "definit": [4, 11, 22], "unless": 4, "cours": [4, 7, 8, 9, 14, 15], "somewher": 4, "token": 4, "freeli": 4, "few": [4, 7, 8, 9, 11, 12, 21], "special": [4, 7, 8, 9, 17, 19, 23], "notic": [4, 7, 8, 9], "haven": [4, 14], "yet": [4, 6, 11, 12, 19], "basic": [4, 6, 7, 8, 9, 19], "block_size_x": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 19, 21, 22], "block_size_i": [4, 5, 7, 8, 9, 11, 12, 15, 22], "block_size_z": [4, 7, 8, 9, 11, 22], "interpret": 4, "z": [4, 6, 11, 22], "block_size_nam": [4, 6, 22], "let": [4, 6, 7, 8, 9, 19, 21], "creation": [4, 13, 18], "trusti": 4, "old": 4, "16": [4, 5, 7, 8, 9, 11, 12, 15], "dict": [4, 5, 6, 9, 12, 13, 17, 18, 19, 21, 22], "undefin": [4, 6, 7, 8, 9, 15], "filter_heigth": 4, "could": [4, 5, 6, 7, 8, 9, 12, 14, 15, 17, 18, 21, 22], "runtim": [4, 6, 7, 8, 9, 13, 14, 17, 21], "setup": [4, 7, 8, 9, 12, 14, 17, 20], "everyth": [4, 6, 7, 8, 9], "answer": [4, 5, 6, 7, 8, 9, 10, 22], "alloc": [4, 6, 7, 8, 9, 10, 12, 22], "move": [4, 6, 7, 12, 15, 18, 22], "content": [4, 6, 22], "deriv": [4, 6, 7, 8, 9, 16], "retriev": [4, 6, 22], "free": [4, 7, 8, 9, 12, 14, 15], "return": [4, 5, 6, 7, 8, 9, 11, 12, 15, 17, 18, 19, 20, 22], "contrast": 4, "wa": [4, 6, 7, 8, 9, 17, 22], "finish": [4, 6, 8, 11, 12, 17], "than": [4, 7, 8, 9, 11, 16, 17, 18, 22, 23], "highli": [4, 13, 15], "parametr": 4, "long": [4, 7, 8, 9, 11, 12, 15, 20], "instead": [4, 6, 10, 15, 22], "littl": [4, 7, 8, 9, 15], "ve": [4, 7, 8, 9, 14, 15], "familiar": [4, 15], "kernel_str": [4, 5, 6, 7, 8, 9, 12, 13, 18, 22], "tune_param": [4, 5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21, 22], "similarli": 4, "singl": [4, 5, 6, 7, 8, 9, 12, 15, 17, 21, 22], "wai": [4, 7, 8, 9, 11, 12, 13, 14, 15, 16, 17, 22], "64": [4, 7, 8, 9, 13, 15, 19, 21], "128": [4, 7, 8, 9, 13, 19, 21], "try": [4, 6, 7, 8, 9, 14, 15, 18, 22], "env": [4, 6, 18, 19, 22], "cartesian": [4, 11], "product": [4, 7, 8, 22], "realli": [4, 7, 8, 9, 14], "howev": [4, 5, 7, 8, 9, 12, 14, 15, 17, 20, 21, 22], "lot": [4, 7, 8, 9, 15, 17, 19, 20, 22], "problemat": 4, "explain": [4, 6, 7, 8, 9, 12, 14, 15, 16, 19, 21, 22], "illeg": 4, "2048": 4, "1024": [4, 7, 8, 9, 19], "fail": [4, 6, 14, 22], "too": [4, 7, 8, 9, 11, 12, 15, 22], "regist": [4, 7, 8, 9, 15, 17], "silent": 4, "verbos": [4, 5, 6, 7, 8, 9, 12, 22], "bound": [4, 6, 15, 18], "ignor": [4, 6, 7, 8, 9, 22], "two": [4, 6, 7, 8, 9, 10, 15, 16, 18, 22], "thing": [4, 12, 15], "record": [4, 6, 7, 17, 22], "show": [4, 7, 8, 9, 10, 13, 16, 20], "secondli": [4, 15], "experi": 4, "took": [4, 7, 9, 18, 19, 22], "place": [4, 7, 8, 9, 17, 18, 19, 22], "That": [4, 7, 8, 9, 12, 15, 16, 19], "mean": [4, 12, 15, 16, 18, 20, 21, 23], "softwar": [4, 7, 8, 9, 13, 14, 17, 18, 19], "along": [4, 6, 14, 19, 23], "second": [4, 5, 6, 7, 8, 9, 11, 15, 16, 17, 18, 22], "alwai": [4, 6, 7, 8, 9], "circumst": 4, "obtain": [4, 7, 8, 9, 11, 17], "promis": 4, "tile": [4, 10, 15], "factor": [4, 7, 8, 9, 10, 11, 15, 23], "amount": [4, 7, 8, 9, 15, 16, 22], "particular": [4, 6, 7, 8, 10, 12, 15, 17, 20], "increas": [4, 7, 8, 9, 17], "certain": [4, 6, 7, 8, 9, 10, 17, 23], "tile_size_x": [4, 5, 7, 8, 9, 12, 15], "4": [4, 7, 8, 9, 11, 15, 17], "tile_size_i": [4, 5, 7, 8, 9, 12, 15, 22], "understand": 4, "everi": [4, 5, 7, 8, 9, 10, 17, 19], "fewer": [4, 7, 8, 9], "total": [4, 6, 7, 8, 9, 15, 16, 19], "stai": 4, "tell": [4, 7, 8, 9, 10, 12, 15, 19, 20], "influenc": 4, "did": [4, 7, 8, 9, 15], "mimick": 4, "behavior": [4, 15, 17, 22], "assum": [4, 6, 7, 8, 9, 15, 22], "far": [4, 7, 8, 9, 15, 19], "grid_div_x": [4, 5, 7, 8, 9, 12, 15, 22], "grid_div_i": [4, 5, 7, 8, 9, 12, 15, 22], "decreas": [4, 15], "correspondingli": 4, "displai": 4, "commonli": [4, 7, 8, 9, 14, 15], "gflop": [4, 6, 10, 15, 16], "giga": [4, 15], "compos": [4, 6, 15, 16], "lambda": [4, 6, 7, 8, 15, 16, 22], "collect": [4, 6, 7, 8, 9, 11, 15, 17, 20], "ordereddict": [4, 7, 8, 9, 11, 15, 16], "p": [4, 6, 15, 16, 20, 22], "1e9": [4, 15], "1e3": [4, 7, 8, 9, 15, 16], "expand": [4, 13, 15, 17], "sinc": [4, 9, 11, 13, 15, 21], "And": [4, 7, 8, 9, 18, 21, 22], "know": [4, 7, 8, 9, 15, 16], "enough": [4, 5, 15], "abl": [4, 6, 7, 8, 9], "own": [4, 9, 12, 14, 16, 17], "whenev": 5, "good": [5, 7, 8, 9, 23], "fast": [5, 7, 8, 9], "instanc": [5, 6, 7, 8, 9, 12, 17, 22], "none": [5, 6, 17, 18, 22], "onc": [5, 6, 7, 8, 9, 11, 17, 22], "comparison": [5, 13], "allclos": [5, 22], "maximum": [5, 6, 11, 18, 22], "absolut": [5, 22], "1e": [5, 22], "6": [5, 7, 8, 9, 11, 12, 22], "toler": 5, "atol": [5, 6, 22], "convolution_correct": 5, "py": [5, 12, 14], "demonstr": [5, 9, 10, 15], "r": [5, 12], "cmem_arg": [5, 6, 22], "d_filter": 5, "arg": [5, 6, 7, 8, 9, 11, 12, 13, 15, 18, 19, 20, 21], "field": [5, 7, 8, 9], "its": [5, 6, 7, 8, 9, 11, 13, 14, 15, 16, 17, 22], "almost": [5, 7, 8, 9, 17], "whose": [5, 22], "trust": [5, 18], "construct": [5, 15], "There": [5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 17, 19, 22, 23], "precomput": 5, "flexibl": [5, 7, 8, 15], "callabl": [5, 6, 22], "accept": [5, 6, 18, 22], "cpu_result": 5, "gpu_result": [5, 7, 9], "although": 5, "semant": 5, "posit": [5, 6, 11, 18, 21, 22], "reflect": [5, 17], "reduct": [5, 16, 22], "snippet": 5, "sum_x": 5, "custom": [5, 10, 16, 17, 20], "def": [5, 6, 7, 8, 9, 11, 17, 20], "verify_partial_reduc": 5, "isclos": 5, "first_kernel": 5, "_": [5, 7, 8, 9], "sum_float": 5, "map": [5, 10, 11], "third": [5, 15], "partial": [5, 7, 8, 9, 10], "cpu": [5, 8, 9, 12], "achiev": [5, 9], "element": [5, 7, 8, 9, 15, 16, 19, 20, 22], "necessarili": [5, 12], "section": [6, 7, 8, 9], "intern": [6, 13, 18, 21], "mostli": [6, 13, 22], "relev": [6, 13, 17], "develop": [6, 10, 13, 14], "extens": 6, "architectur": [6, 17], "At": [6, 11, 22], "expos": 6, "respons": 6, "iter": [6, 7, 8, 9, 11, 15, 17, 18, 19, 22], "brute_forc": [6, 22], "valid": [6, 10, 15, 22], "random_sampl": [6, 22], "sampl": [6, 18, 22], "advanc": [6, 21, 22], "being": [6, 7, 8, 9, 15, 17, 18, 22], "strategy_opt": [6, 18, 22], "sai": [6, 7, 8, 9, 19, 21], "foreseen": 6, "futur": [6, 13, 22, 23], "high": [6, 7, 8, 9, 13, 15, 17], "low": [6, 7, 8, 9, 15], "abstract": [6, 17], "ready_argument_list": 6, "build": [6, 7, 8, 9], "bottom": 6, "either": [6, 11, 18, 21, 22], "typic": [6, 14, 15, 22], "gcc": 6, "fortran": [6, 10, 21], "turn": 6, "launch": [6, 7, 8, 9, 12, 17, 22], "rest": [6, 7, 8, 9], "helper": [6, 17], "get_opt": 6, "suppli": [6, 12, 15, 18, 21, 22], "get_strategy_docstr": 6, "method": [6, 7, 8, 9, 12, 15, 17, 18], "make_strategy_options_doc": 6, "scale_from_param": 6, "ep": [6, 18], "func": [6, 17, 22], "invers": 6, "unscal": 6, "setup_method_argu": 6, "setup_method_opt": 6, "tuning_opt": [6, 18], "snap_to_nearest_config": 6, "closest": 6, "unscale_and_snap_to_nearest": 6, "snap": 6, "scale": 6, "nearest": [6, 22], "class": [6, 17, 18], "kernel_opt": 6, "device_opt": 6, "__init__": 6, "instanti": [6, 21], "kernelsourc": 6, "parameter_spac": [6, 18], "iterfac": 6, "platform": [6, 13, 14, 17, 22], "quiet": [6, 22], "compiler_opt": [6, 22], "7": [6, 7, 8, 9, 11, 22], "offer": 6, "bool": [6, 20, 22], "gpu_arg": 6, "skip_nvml_set": 6, "benchmark_continu": 6, "durat": [6, 17], "benchmark_default": 6, "check_kernel_output": 6, "compile_kernel": 6, "copy_constant_memory_arg": 6, "recent": [6, 14, 17], "copy_shared_memory_arg": 6, "smem_arg": [6, 22], "copy_texture_memory_arg": 6, "texmem_arg": [6, 22], "create_kernel_inst": 6, "get_environ": 6, "memcpy_dtoh": [6, 7], "dest": 6, "src": 6, "copi": [6, 7, 8, 9, 12, 19, 22], "static": 6, "preprocess_gpu_argu": 6, "old_argu": 6, "flat": 6, "given": [6, 7, 8, 9, 11, 17, 18, 22], "mem": 6, "set_nvml_paramet": 6, "nvml": [6, 23], "leak": 6, "group": [6, 7, 8, 9, 22], "maintain": 6, "state": [6, 7, 8, 9, 17, 22], "interact": [6, 17], "properti": [6, 15, 22], "context": [6, 7, 9, 11], "kernel_inst": 6, "lookup": 6, "directli": [6, 7, 8, 9, 12, 15, 17, 21, 22], "ndarrai": [6, 11], "format": [6, 7, 8, 20], "kei": [6, 7, 8, 9, 15, 18, 19, 22], "symbol": [6, 22], "similar": [6, 12, 15, 22], "regular": [6, 9, 17], "int32": [6, 13, 19, 21, 22], "kernel_finish": 6, "devicealloc": 6, "memcpy_htod": [6, 7], "memset": 6, "unsign": [6, 8], "byte": [6, 20, 22], "tupl": [6, 9, 11, 18, 22], "start_ev": 6, "event": [6, 7, 12, 17], "mark": 6, "stop_ev": 6, "synchron": [6, 7, 9, 11, 15, 16], "halt": [6, 12], "until": [6, 12], "task": 6, "rawkernel": 6, "cudeviceptr": 6, "cufunct": 6, "must": [6, 16, 22], "buffer": [6, 8, 20], "fill": [6, 15], "item": [6, 7, 8, 9, 11], "ndrang": 6, "cfunction": 6, "cleanup_lib": 6, "previous": [6, 7, 8, 9, 15], "librari": [6, 10, 17, 20], "kernelinst": 6, "repres": [6, 7, 8, 9], "tunabl": [6, 7, 8, 9, 10, 11, 15, 16, 17, 18, 19, 21, 22, 23], "ctype": 6, "_funcptr": 6, "asynchron": 6, "memcpi": [6, 12], "c_arg": 6, "robust": 6, "averag": [6, 7, 8, 9, 12, 17], "ptr": 6, "pionter": 6, "compilationfailedconfig": 6, "errorconfig": 6, "invalidconfig": 6, "npencod": 6, "skipkei": 6, "ensure_ascii": 6, "check_circular": 6, "allow_nan": 6, "sort_kei": 6, "indent": 6, "separ": [6, 10, 12, 21], "dump": [6, 7, 8], "json": [6, 7, 8, 10, 22], "obj": 6, "subclass": 6, "serializ": 6, "rais": 6, "typeerror": 6, "arbitrari": 6, "els": 6, "jsonencod": 6, "runtimefailedconfig": 6, "skippablefailur": 6, "stopcriterionreach": 6, "thrown": 6, "stop": [6, 18], "criterion": [6, 18], "reach": 6, "check_argument_list": 6, "check_argument_typ": 6, "dtype": [6, 20], "kernel_argu": 6, "check_restrict": 6, "whether": [6, 16, 18, 22], "meet": 6, "check_stop_criterion": 6, "max_fev": [6, 18, 22], "exceed": 6, "check_thread_block_dimens": 6, "max_thread": 6, "check_tune_params_list": 6, "simulation_mod": [6, 22], "forbidden": 6, "compile_restrict": 6, "monolith": 6, "try_to_constraint": 6, "union": 6, "str": [6, 7, 8, 9, 11], "constraint": 6, "pars": [6, 7, 8], "config_valid": 6, "max": 6, "convert_constraint_restrict": 6, "convert": [6, 7, 8], "backward": 6, "correct_open_cach": 6, "open_cach": 6, "properli": 6, "close": [6, 7, 8, 9], "pretend": 6, "cuda_error_check": 6, "statu": 6, "delete_temp_fil": 6, "delet": 6, "temporari": 6, "complain": 6, "detect_languag": 6, "attempt": [6, 21], "detect": [6, 18, 21, 22], "dump_cach": 6, "omit": 6, "sever": [6, 7, 8, 9, 10, 11, 14, 15, 21, 22], "store_cach": 6, "speed": 6, "great": [6, 7, 8, 9, 19], "power": [6, 15, 17, 23], "get_best_config": 6, "objective_higher_is_bett": [6, 16, 22], "best": [6, 7, 8, 11, 15, 18, 21, 22, 23], "accord": [6, 22], "get_config_str": 6, "compact": 6, "represent": [6, 20], "get_grid_dimens": 6, "current_problem_s": 6, "grid_div": 6, "dim": 6, "get_instance_str": 6, "debug": 6, "advis": 6, "get_kernel_str": [6, 7, 8, 9], "One": [6, 7, 8, 9, 17, 20], "get_problem_s": 6, "get_smem_arg": 6, "get_temp_filenam": 6, "suffix": [6, 22], "form": [6, 15, 17, 18], "temp_x": 6, "larg": [6, 7, 8, 9, 11, 22], "integ": [6, 17, 20, 22], "get_thread_block_dimens": 6, "convent": [6, 12, 22], "get_total_tim": 6, "overhead_tim": 6, "looks_like_a_filenam": 6, "normalize_verify_funct": 6, "v": [6, 7, 8, 9, 11], "normal": [6, 18, 22], "result_host": 6, "keyword": 6, "behaviour": 6, "parse_restrict": 6, "prepare_kernel_str": 6, "prepend": [6, 9], "seri": [6, 11], "By": [6, 12, 15, 18, 22], "macro": 6, "made": 6, "print_config": 6, "print_config_output": 6, "process_cach": 6, "device_nam": [6, 22], "tune_params_kei": 6, "x1": 6, "x2": 6, "xn": 6, "234342": 6, "y1": 6, "y2": 6, "yn": 6, "134233": 6, "bracket": 6, "earlier": [6, 7, 8, 9, 11], "abruptli": 6, "process_metr": 6, "calcul": [6, 11], "express": [6, 7, 8, 9, 10, 12, 15, 22], "10000": 6, "read_cach": 6, "cachefil": [6, 22], "read_fil": 6, "replace_param_occurr": 6, "occurr": 6, "setup_block_and_grid": 6, "to_valid_nvrtc_gpu_arch_cc": 6, "compute_cap": 6, "index": [6, 18], "group__opt": 6, "write_fil": 6, "whole": [7, 8, 9, 15, 18], "model": [7, 8, 9, 13], "physic": 7, "numer": [7, 8, 9], "introduc": [7, 8, 9, 15, 17], "redistribut": [7, 8, 9], "region": [7, 8, 9], "concentr": [7, 8, 9], "bulk": [7, 8, 9], "motion": [7, 8, 9], "concept": [7, 8, 9], "wide": [7, 8, 9, 14, 15], "chemistri": [7, 8, 9], "biologi": [7, 8, 9], "suppos": [7, 8, 9], "metal": [7, 8, 9], "sheet": [7, 8, 9], "temperatur": [7, 8, 9, 17, 18, 23], "equal": [7, 8, 9, 15, 22], "degre": [7, 8, 9], "everywher": [7, 8, 9], "heat": [7, 8, 9], "thousand": [7, 8, 9], "instant": [7, 8, 9, 11], "hotspot": [7, 8, 9], "cooler": [7, 8, 9], "area": [7, 8, 9, 15], "melt": [7, 8, 9], "loss": [7, 8, 9], "radiat": [7, 8, 9], "frac": [7, 8, 9], "d": [7, 8, 9, 11, 18, 19], "spatial": [7, 8, 9], "descret": [7, 8, 9], "2d": [7, 8, 9, 10], "quantiti": [7, 8, 9, 16, 17, 22], "nx": [7, 8, 9, 11], "equi": [7, 8, 9], "distant": [7, 8, 9], "direct": [7, 8, 9, 12, 15, 16, 22], "ny": [7, 8, 9, 11], "distanc": [7, 8, 9, 18], "delta": [7, 8, 9], "central": [7, 8, 9], "approxim": [7, 8, 9], "x_i": [7, 8, 9, 11], "x_": [7, 8, 9], "approx": [7, 8, 9], "u_": [7, 8, 9], "2u_": [7, 8, 9], "y_": [7, 8, 9], "estim": [7, 8, 9], "next": [7, 8, 9, 15, 20], "simplifi": [7, 8, 9], "formula": [7, 8, 9], "4u_": [7, 8, 9], "simplic": [7, 8, 9, 11], "assumpt": [7, 8, 9], "boundari": [7, 8, 9], "condit": [7, 8, 9, 15], "dt": [7, 8, 9], "225": [7, 8, 9], "test": [7, 8, 9, 10, 14, 15, 17, 22], "initi": [7, 8, 9, 20], "hot": [7, 8, 9], "plot": [7, 8, 9], "color": [7, 8, 9], "matplotlib": [7, 8, 9, 14], "pyplot": [7, 8, 9], "inlin": [7, 8, 9], "get_initial_condit": [7, 8, 9], "ones": [7, 8, 9, 23], "randint": [7, 8, 9], "1000": [7, 8, 9, 11], "2000": [7, 8, 9], "fig": [7, 8, 9], "ax1": [7, 8, 9], "ax2": [7, 8, 9], "subplot": [7, 8, 9], "imshow": [7, 8, 9], "lt": [7, 8, 9], "axesimag": [7, 8, 9], "0x2aaab952f240": 7, "gt": [7, 8, 9], "quick": [7, 8, 9], "later": [7, 8, 9, 11, 22], "field_copi": [7, 8], "4164": 7, "018869400024": 7, "0x2aab1c98b3c8": 7, "worri": [7, 9], "terminologi": [7, 9], "text": [7, 9, 15], "5": [7, 8, 9, 11, 18], "225f": [7, 8, 9], "diffuse_kernel": [7, 8, 9], "u_new": [7, 8, 9], "0f": [7, 8, 9], "togeth": [7, 8, 9, 14, 22], "impact": [7, 8, 9, 12], "fix": [7, 8, 9, 18, 22], "unrol": [7, 8, 9, 10, 15, 23], "loop": [7, 8, 9, 10, 15, 23], "drv": 7, "sourcemodul": [7, 9, 11], "init": 7, "make_context": 7, "devprop": 7, "k": [7, 8, 9, 11, 13, 15, 19], "get_devic": 7, "get_attribut": 7, "cc": 7, "compute_capability_major": 7, "compute_capability_minor": 7, "u_old": [7, 9], "mem_alloc": 7, "nbyte": 7, "block_size_str": [7, 9], "arch": 7, "sm_": 7, "get_funct": [7, 9, 11], "boilerpl": [7, 8, 9], "moment": [7, 8, 9, 22], "serv": [7, 8, 9, 16, 18], "guess": [7, 8, 9], "pair": [7, 8, 9], "500": [7, 8, 9], "time_sinc": 7, "zeros_lik": [7, 11, 13, 15, 19, 21], "set_titl": [7, 8, 9], "53": [7, 8, 9], "423038482666016": 7, "0x2aaabbdcb2e8": 7, "faster": [7, 8, 9, 15], "cleanup": 7, "pop": 7, "think": [7, 8, 9], "messi": [7, 8, 9], "got": [7, 8, 9], "cleaner": [7, 8, 9], "plai": [7, 8, 9], "difficult": [7, 8, 9, 20, 21], "rather": [7, 8, 9, 22], "underutil": [7, 8, 9], "purpos": [7, 8, 9, 12, 15, 22, 23], "feel": [7, 8, 9], "48": [7, 8, 9], "care": [7, 8, 9], "appropi": [7, 8, 9], "fly": [7, 8, 9], "12": [7, 8, 9], "13": [7, 8, 9], "geforc": [7, 8, 9, 11], "gtx": [7, 8, 9, 11], "titan": [7, 8, 9], "22305920124": 7, "779033613205": 7, "824838399887": 7, "900499212742": 7, "999763202667": 7, "727967989445": 7, "752479994297": 7, "797900807858": 7, "876627194881": 7, "93347837925": 7, "766662418842": 7, "803033602238": 7, "853574407101": 7, "971545600891": 7, "763775992393": 7, "791257584095": 7, "848044800758": 7, "922745585442": 7, "792595207691": 7, "822137594223": 7, "893279993534": 7, "millisecond": [7, 8, 9], "matter": [7, 8, 9, 12], "analyz": [7, 8, 9], "seem": [7, 8, 9], "vari": [7, 8, 9, 11, 15, 16], "addtion": [7, 8, 9], "among": [7, 8, 9, 13, 18], "128x32": [7, 8, 9], "likewis": [7, 8, 9], "becom": [7, 8, 9, 17, 18], "affect": [7, 8, 9, 15], "within": [7, 8, 9, 11, 15, 18, 22], "exchang": [7, 8, 9], "fact": [7, 8, 9, 12], "commun": [7, 8, 9], "idea": [7, 8, 9, 12, 15, 23], "l2": [7, 8, 9], "closer": [7, 8, 9], "multiprocessor": [7, 8, 9], "l1": [7, 8, 9], "fine": [7, 8, 9], "grain": [7, 8, 9], "manag": [7, 8, 9, 15, 17], "cost": [7, 8, 9, 18], "overhead": [7, 8, 9, 15], "degrad": [7, 8, 9], "intermedi": [7, 8, 9], "mind": [7, 8, 9], "14": [7, 8, 9], "tx": [7, 8, 9, 15], "ty": [7, 8, 9, 15], "bx": [7, 8, 9, 11], "__shared__": [7, 9, 15], "sh_u": [7, 8, 9], "pragma": [7, 8, 9, 15], "__syncthread": [7, 8, 9, 15], "75041918755": 7, "18713598251": 7, "09015038013": 7, "06844799519": 7, "09730558395": 7, "14420480728": 7, "05957758427": 7, "07508480549": 7, "0731967926": 7, "14729599953": 7, "08389122486": 7, "10700161457": 7, "10125439167": 7, "31661438942": 7, "0629119873": 7, "04807043076": 7, "054880023": 7, "12033278942": 7, "06672639847": 7, "05816960335": 7, "12000002861": 7, "merg": [7, 8, 9, 15], "half": [7, 8, 9], "doubl": [7, 8, 9, 20, 21], "cover": [7, 8, 9, 18], "beyond": [7, 8, 9, 22], "reduc": [7, 8, 9, 15], "condens": [7, 8, 9], "keep": [7, 8, 9, 15, 20], "importantli": [7, 8, 9], "worst": [7, 8, 9], "15": [7, 8, 9, 21], "tj": [7, 8, 9], "ti": [7, 8, 9, 11], "somehow": [7, 8, 9], "larger": [7, 8, 9, 12, 18, 21], "insid": [7, 8, 9, 12, 15, 21, 22], "round": [7, 8, 9, 22], "arithmet": [7, 8, 9, 22], "evalu": [7, 8, 9, 15, 18, 22], "759308815": 7, "29789438248": 7, "06983039379": 7, "2634239912": 7, "997139203548": 7, "843692803383": 7, "05549435616": 7, "862348806858": 7, "750636804104": 7, "19084160328": 7, "876377594471": 7, "714169609547": 7, "875001597404": 7, "691116797924": 7, "575859189034": 7, "759679996967": 7, "622867202759": 7, "650336003304": 7, "09794559479": 7, "826515209675": 7, "692665600777": 7, "78363519907": 7, "646092808247": 7, "554745602608": 7, "716115188599": 7, "581280004978": 7, "662566399574": 7, "07386879921": 7, "833420813084": 7, "705055999756": 7, "840755212307": 7, "652575993538": 7, "569388794899": 7, "689356791973": 7, "597267186642": 7, "675232005119": 7, "10033922195": 7, "860332798958": 7, "731891202927": 7, "867276787758": 7, "68781440258": 7, "595276796818": 7, "735436797142": 7, "60216319561": 7, "852166390419": 7, "15089921951": 7, "852575981617": 7, "705932807922": 7, "888671982288": 7, "673248004913": 7, "563417613506": 7, "761139214039": 7, "621254396439": 7, "676595199108": 7, "06709122658": 7, "804953610897": 7, "685670387745": 7, "801798415184": 7, "632006394863": 7, "542387211323": 7, "722668802738": 7, "578745603561": 7, "618598401546": 7, "08220798969": 7, "821881604195": 7, "687955200672": 7, "77759360075": 7, "618003201485": 7, "539891195297": 7, "705900788307": 7, "568556785583": 7, "624492788315": 7, "0799423933": 7, "832300806046": 7, "70140799284": 7, "835481595993": 7, "638348805904": 7, "550105595589": 7, "667251205444": 7, "576044797897": 7, "732409596443": 7, "15916161537": 7, "869497597218": 7, "733248019218": 7, "890803205967": 7, "677363204956": 7, "577215993404": 7, "730982398987": 7, "58035838604": 7, "10066559315": 7, "837804794312": 7, "691385602951": 7, "851040017605": 7, "666656005383": 7, "560505592823": 7, "771103990078": 7, "626163220406": 7, "694451200962": 7, "11514236927": 7, "837299215794": 7, "703302407265": 7, "806828796864": 7, "648620784283": 7, "562521612644": 7, "760915207863": 7, "605760002136": 7, "690009605885": 7, "10740480423": 7, "841631996632": 7, "700883197784": 7, "838195204735": 7, "649779188633": 7, "56585599184": 7, "7168192029": 7, "59088640213": 7, "69627519846": 7, "3269824028": 7, "02665598392": 7, "840908801556": 7, "03752319813": 7, "788345599174": 7, "662041604519": 7, "85437438488": 7, "680422389507": 7, "0759360075": 7, "801996803284": 7, "666003203392": 7, "808000004292": 7, "643359994888": 7, "544691193104": 7, "741964805126": 7, "60942081213": 7, "681350398064": 7, "05262081623": 7, "792108798027": 7, "66344319582": 7, "768064010143": 7, "625260794163": 7, "540352010727": 7, "721862399578": 7, "579411196709": 7, "626976013184": 7, "06332798004": 7, "808211183548": 7, "679372787476": 7, "803718411922": 7, "627136015892": 7, "538227200508": 7, "682188808918": 7, "573836791515": 7, "725548803806": 7, "13023357391": 7, "843411195278": 7, "713843202591": 7, "85886080265": 7, "657920002937": 7, "565254402161": 7, "697094392776": 7, "579904007912": 7, "07484800816": 7, "801119995117": 7, "667347204685": 7, "799059200287": 7, "643820810318": 7, "542937588692": 7, "740518403053": 7, "615148806572": 7, "731334400177": 7, "07002239227": 7, "805299210548": 7, "675923216343": 7, "782060790062": 7, "631142401695": 7, "540383994579": 7, "723999989033": 7, "578681600094": 7, "726335990429": 7, "13297917843": 7, "844428789616": 7, "710278391838": 7, "835494399071": 7, "637958395481": 7, "567417597771": 7, "699366402626": 7, "588492810726": 7, "tri": [7, 8, 9, 18], "grow": [7, 8, 9], "quickli": [7, 8, 9], "went": [7, 8, 9, 11], "72": [7, 8, 9], "26": [7, 8, 9], "32x2": [7, 8, 9], "64x4": [7, 8, 9], "four": [7, 8, 9], "best_tim": [7, 8], "min": [7, 8], "05": [7, 8], "join": [7, 8], "nice": [7, 8], "stdout": [7, 8], "why": [7, 8, 12, 16], "easili": [7, 8, 17], "easi": [7, 8, 16, 17, 22], "csv": [7, 8, 10], "analysi": [7, 8, 13], "panda": [7, 8, 10, 14], "18": [7, 8, 9], "fp": [7, 8], "datafram": [7, 8], "df": [7, 8], "to_csv": [7, 8], "0x2aab1de088d0": 8, "01": 8, "sy": 8, "140": 8, "wall": 8, "98": 8, "__kernel": 8, "get_group_id": 8, "get_local_id": 8, "cl": 8, "ctx": 8, "create_some_context": 8, "mf": 8, "mem_flag": 8, "a_h": 8, "a_d": 8, "read_writ": 8, "copy_host_ptr": 8, "hostbuf": 8, "b_d": 8, "kernel_src": 8, "prg": 8, "queue": 8, "commandqueu": 8, "run_gpu": 8, "444": 8, "154": 8, "598": 8, "985": 8, "enqueue_copi": 8, "1748096": 8, "7284544": 8, "7707904": 8, "8573184": 8, "8380288": 8, "686528": 8, "69648": 8, "7461632": 8, "818304": 8, "771072": 8, "7190464": 8, "7522432": 8, "7982208": 8, "9624512": 8, "7214464": 8, "7453312": 8, "8028416": 8, "8922624": 8, "747328": 8, "7860736": 8, "8637184": 8, "__local": 8, "barrier": 8, "clk_local_mem_f": 8, "8449472": 8, "1912576": 8, "1035136": 8, "0927808": 8, "1140736": 8, "1790336": 8, "0808192": 8, "0809792": 8, "0836928": 8, "1545856": 8, "1249984": 8, "1264": 8, "1230336": 8, "4015104": 8, "0873216": 8, "0626496": 8, "0692224": 8, "140192": 8, "0801344": 8, "0688128": 8, "1428928": 8, "8844544": 8, "3245952": 8, "0911808": 8, "3039616": 8, "0079296": 8, "84848": 8, "0708288": 8, "857728": 8, "7561792": 8, "231072": 8, "8774336": 8, "7087296": 8, "8772672": 8, "6911872": 8, "5715968": 8, "7584896": 8, "6292032": 8, "6498688": 8, "1145664": 8, "8252928": 8, "6757568": 8, "7881152": 8, "6237696": 8, "544224": 8, "6951168": 8, "5648128": 8, "6452736": 8, "1065792": 8, "8313792": 8, "6905984": 8, "8302656": 8, "6367488": 8, "5478592": 8, "6660672": 8, "5719744": 8, "6551744": 8, "1384064": 8, "8531072": 8, "7078976": 8, "8516672": 8, "6677696": 8, "5685632": 8, "7074048": 8, "5753152": 8, "8228864": 8, "2124736": 8, "8633344": 8, "6921216": 8, "8896384": 8, "6659904": 8, "5582144": 8, "7522624": 8, "6081536": 8, "6664448": 8, "1095936": 8, "8063424": 8, "6717888": 8, "7982848": 8, "6263552": 8, "5289728": 8, "7008832": 8, "567456": 8, "5968704": 8, "1018432": 8, "8117248": 8, "6724736": 8, "7728576": 8, "6038336": 8, "5172352": 8, "6796352": 8, "5470016": 8, "5968448": 8, "1107712": 8, "8237248": 8, "6810944": 8, "821952": 8, "620352": 8, "5230208": 8, "6415552": 8, "5476864": 8, "7168192": 8, "1942016": 8, "8626304": 8, "7099712": 8, "9123328": 8, "6608448": 8, "5631168": 8, "7113024": 8, "556576": 8, "1583104": 8, "8384832": 8, "67856": 8, "845856": 8, "6581248": 8, "54944": 8, "7520064": 8, "6076224": 8, "6842112": 8, "1547072": 8, "8422016": 8, "6895552": 8, "8037312": 8, "6387072": 8, "5383296": 8, "7326656": 8, "5863488": 8, "6813376": 8, "1493952": 8, "8444928": 8, "6929216": 8, "832768": 8, "6389312": 8, "5412672": 8, "698336": 8, "5717568": 8, "676096": 8, "4303104": 8, "0341696": 8, "8365184": 8, "0398656": 8, "7786496": 8, "648928": 8, "8479232": 8, "6508544": 8, "1219392": 8, "7994048": 8, "6492288": 8, "8068416": 8, "6343168": 8, "5235328": 8, "7268928": 8, "5898432": 8, "6633536": 8, "0849664": 8, "7869632": 8, "6458624": 8, "7611968": 8, "613088": 8, "50912": 8, "6972928": 8, "5620608": 8, "601856": 8, "095232": 8, "7967488": 8, "6601472": 8, "7952896": 8, "6047296": 8, "5108224": 8, "6607744": 8, "5492416": 8, "7091136": 8, "171552": 8, "8473408": 8, "6962112": 8, "8663936": 8, "6466816": 8, "5475584": 8, "6754048": 8, "5591744": 8, "108896": 8, "7907264": 8, "6459328": 8, "7965888": 8, "6250816": 8, "5188416": 8, "721408": 8, "5920832": 8, "7068608": 8, "0909248": 8, "7930752": 8, "6524544": 8, "7745216": 8, "6146176": 8, "5116928": 8, "6975872": 8, "5548416": 8, "7075136": 8, "174624": 8, "8384512": 8, "69104": 8, "8335488": 8, "6264192": 8, "5445248": 8, "6719104": 8, "5592064": 8, "19": [8, 9], "solv": 9, "0x7f888f8cd7b8": 9, "4152": 9, "086019515991": 9, "0x7f8865b51f28": 9, "gpuarrai": [9, 11], "tool": [9, 11, 13], "autoinit": [9, 11], "to_gpu": [9, 11], "mod": [9, 11], "t0": [9, 11], "ona": 9, "33": 9, "46109390258789": 9, "0x7f8858b873c8": 9, "1080": [9, 11], "916985595226": 9, "489004802704": 9, "500524806976": 9, "513356792927": 9, "545715200901": 9, "486515200138": 9, "449055999517": 9, "44974719882": 9, "457427197695": 9, "492915201187": 9, "464863997698": 9, "466118401289": 9, "475264000893": 9, "513632011414": 9, "458412796259": 9, "457715201378": 9, "461017608643": 9, "475987195969": 9, "460032004118": 9, "457779198885": 9, "462649595737": 9, "kernel_string_shar": 9, "22673916817": 9, "826361596584": 9, "793516802788": 9, "782112002373": 9, "776639997959": 9, "795135998726": 9, "722777605057": 9, "762777590752": 9, "75422719717": 9, "804876792431": 9, "778656005859": 9, "769734406471": 9, "782495999336": 9, "932281601429": 9, "734028804302": 9, "721625590324": 9, "736511993408": 9, "800019192696": 9, "724966406822": 9, "722969603539": 9, "759430396557": 9, "kernel_string_til": 9, "22200961113": 9, "91601279974": 9, "752838408947": 9, "873651194572": 9, "69833599329": 9, "586931192875": 9, "516473591328": 9, "411392003298": 9, "384262400866": 9, "82159358263": 9, "632607996464": 9, "506457602978": 9, "618758392334": 9, "500288009644": 9, "429862397909": 9, "44995200038": 9, "366150397062": 9, "342201602459": 9, "793542397022": 9, "58026239872": 9, "494163197279": 9, "546316814423": 9, "467059195042": 9, "404249596596": 9, "440895992517": 9, "341376006603": 9, "339692795277": 9, "783923208714": 9, "597920000553": 9, "50277120471": 9, "615475213528": 9, "470937597752": 9, "418393599987": 9, "443519997597": 9, "343961596489": 9, "342540800571": 9, "780352008343": 9, "611705589294": 9, "515667212009": 9, "622534394264": 9, "502195191383": 9, "437388807535": 9, "45568639636": 9, "359289598465": 9, "426995199919": 9, "788947200775": 9, "616556799412": 9, "496121603251": 9, "629164803028": 9, "474841600657": 9, "407667201757": 9, "47406719923": 9, "371507203579": 9, "352531200647": 9, "72023679018": 9, "574816000462": 9, "481817597151": 9, "580928003788": 9, "455724793673": 9, "394975996017": 9, "464659202099": 9, "357107198238": 9, "324083191156": 9, "759910392761": 9, "569177603722": 9, "481279999018": 9, "528115200996": 9, "441734397411": 9, "393126398325": 9, "455404800177": 9, "350457596779": 9, "322547197342": 9, "754201591015": 9, "579827189445": 9, "491852802038": 9, "582751989365": 9, "451283198595": 9, "391807991266": 9, "456275194883": 9, "356716805696": 9, "362937599421": 9, "809894394875": 9, "60433280468": 9, "507142400742": 9, "655827200413": 9, "474092799425": 9, "408166396618": 9, "480531209707": 9, "346707201004": 9, "780134403706": 9, "601049602032": 9, "493900799751": 9, "620384001732": 9, "494553589821": 9, "425414395332": 9, "467033600807": 9, "375468802452": 9, "346079999208": 9, "771052801609": 9, "593977594376": 9, "49723520875": 9, "583270406723": 9, "478079998493": 9, "416320002079": 9, "443942397833": 9, "359744000435": 9, "343545603752": 9, "780960011482": 9, "598758399487": 9, "498617601395": 9, "57678719759": 9, "46561280489": 9, "41324160099": 9, "431225597858": 9, "351263999939": 9, "34440960288": 9, "933260798454": 9, "715257608891": 9, "586604809761": 9, "711615991592": 9, "558771193027": 9, "466284793615": 9, "44043520093": 9, "361823999882": 9, "731839990616": 9, "57044479847": 9, "470220798254": 9, "608800005913": 9, "472665601969": 9, "416352003813": 9, "481376004219": 9, "380812799931": 9, "351923197508": 9, "719257593155": 9, "55171200037": 9, "466758400202": 9, "568435204029": 9, "459654402733": 9, "394380801916": 9, "463052803278": 9, "36409599781": 9, "328998398781": 9, "73579518795": 9, "564575994015": 9, "472236800194": 9, "549024009705": 9, "438406395912": 9, "389945602417": 9, "455193603039": 9, "364051198959": 9, "375519996881": 9, "798195195198": 9, "588998401165": 9, "49552000761": 9, "595462405682": 9, "460972803831": 9, "400672000647": 9, "465132802725": 9, "364627194405": 9, "729363203049": 9, "558815991879": 9, "466655993462": 9, "600819194317": 9, "460281592607": 9, "404908800125": 9, "478739196062": 9, "386668801308": 9, "385510402918": 9, "720915210247": 9, "550668799877": 9, "466937589645": 9, "564921605587": 9, "447974395752": 9, "394271999598": 9, "46233600378": 9, "365190398693": 9, "387827193737": 9, "762003195286": 9, "579007995129": 9, "486649608612": 9, "557331204414": 9, "443033593893": 9, "396070402861": 9, "457075202465": 9, "369555193186": 9, "wish": 9, "modifi": [9, 17], "tile_size_j": 9, "fixed_param": [9, 11], "ceil": [9, 11], "zip": [9, 11], "transfer": [9, 10, 12], "20": [9, 18], "21": 9, "618": 9, "2231903076172": 9, "0x7f887c3d2358": 9, "incorpor": 9, "ifndef": 9, "kerenel": 9, "psedo": 9, "endif": 9, "bypass": 9, "usecas": 10, "test_vector_add": 10, "test_vector_add_parameter": 10, "highlight": 10, "contact": 10, "illustr": 10, "openacc": 10, "dimension": [10, 11, 22], "clean": [10, 15], "center": [10, 11], "lock": [10, 17], "overlap": [10, 12], "shuffl": 10, "pipelin": 10, "consist": [10, 15, 22], "scipi": 10, "algorithm": [10, 13, 18, 22], "cub": 10, "gaussian": 11, "delv": 11, "hand": [11, 15], "sum_": 11, "exp": 11, "beta": [11, 18], "sqrt": 11, "y_i": 11, "z_i": 11, "vector": [11, 12, 19], "coordin": 11, "linalg": 11, "la": 11, "compute_grid": 11, "xgrid": 11, "ygrid": 11, "zgrid": 11, "x0": 11, "y0": 11, "z0": 11, "themselv": 11, "meshgrid": 11, "send": 11, "interv": 11, "256": [11, 13, 19], "suffici": [11, 16], "100": [11, 18, 22], "randomli": [11, 18], "distribut": [11, 15], "linspac": 11, "cpu_grid": 11, "npt": 11, "rand": 11, "xyz": [11, 22], "52320": 11, "160627": 11, "might": [11, 16], "nz": 11, "bz": 11, "kernel_cod": 11, "math": 11, "__host__": 11, "__device__": [11, 21], "b": [11, 13, 15, 18, 19, 21], "addgrid": 11, "xvect": 11, "yvect": 11, "zvect": 11, "dx": 11, "dy": 11, "dz": 11, "assign": 11, "explor": 11, "middl": 11, "henc": [11, 20], "mention": 11, "56833920479": 11, "80796158314": 11, "940044796467": 11, "855628800392": 11, "855359995365": 11, "16174077988": 11, "11877760887": 11, "01592960358": 11, "849273598194": 11, "849235200882": 11, "19029750824": 11, "16199679375": 11, "40401918888": 11, "39618558884": 11, "39508478642": 11, "31647996902": 11, "31470079422": 11, "50787198544": 11, "53760001659": 11, "56709756851": 11, "34500494003": 11, "25130877495": 11, "50662400723": 11, "55267841816": 11, "17987194061": 11, "12309756279": 11, "01125121117": 11, "849631989002": 11, "853708791733": 11, "17051515579": 11, "15584001541": 11, "40074241161": 11, "39547519684": 11, "39331197739": 11, "30295038223": 11, "28725762367": 11, "39589118958": 11, "38867840767": 11, "37724158764": 11, "34344320297": 11, "26213116646": 11, "38793599606": 11, "3775359869": 11, "74003200531": 11, "13276162148": 11, "37233917713": 11, "18835201263": 11, "15777277946": 11, "40247042179": 11, "39366400242": 11, "39439997673": 11, "23719043732": 11, "28542718887": 11, "39207677841": 11, "38956804276": 11, "3778496027": 11, "29814395905": 11, "26398081779": 11, "38625922203": 11, "3754431963": 11, "72981758118": 11, "12483196259": 11, "37322881222": 11, "61618566513": 11, "2194111824": 11, "17600002289": 11, "27082881927": 11, "38787200451": 11, "3835711956": 11, "37543039322": 11, "30227203369": 11, "23127679825": 11, "38627202511": 11, "37677440643": 11, "64358406067": 11, "12255358696": 11, "37474560738": 11, "61655673981": 11, "19179515839": 11, "99912958145": 11, "213971138": 11, "16430072784": 11, "38772480488": 11, "3735104084": 11, "54432649612": 11, "05524477959": 11, "36935677528": 11, "42449922562": 11, "10455036163": 11, "67516155243": 11, "programmat": 11, "30": 11, "minimum": 11, "84": 11, "suit": [11, 22], "grid_dim": 11, "associ": 11, "substitut": 11, "ourselv": 11, "extract": 11, "manual": [11, 14], "exlicitli": 11, "accur": [11, 17], "xgpu": 11, "ygpu": 11, "zgpu": 11, "grid_gpu": 11, "80": 11, "133200": 11, "lower": [11, 17, 18], "roughli": [11, 15], "40000": 11, "across": [12, 15], "qualiti": 12, "itself": [12, 13, 22], "precis": 12, "plain": 12, "omp_get_wtim": 12, "openmp": 12, "convolution_stream": 12, "complex": [12, 15], "behind": 12, "spread": 12, "back": [12, 22], "split": 12, "chunk": 12, "slightli": [12, 15, 21], "account": [12, 15], "border": [12, 22], "latter": 12, "cudastreamwaitev": 12, "num_stream": 12, "clarifi": 12, "fit": [12, 18], "choic": [12, 14], "grid_size_x": 12, "grid_size_i": 12, "cudamemcpytosymbol": 12, "upload": 12, "yourself": [12, 22], "spent": [12, 22], "relat": [13, 16, 23], "famili": 13, "launcher": 13, "kt": [13, 20], "easiest": 13, "toolkit": [13, 14], "intend": 13, "Or": [13, 14], "vector_add": [13, 18, 19, 21], "10000000": 13, "512": [13, 19], "research": 13, "cite": 13, "paper": 13, "significantli": [13, 15, 17], "articl": [13, 19], "author": 13, "ben": 13, "van": 13, "werkhoven": 13, "titl": 13, "auto": [13, 15, 17, 18, 21, 22, 23], "journal": 13, "year": 13, "2019": 13, "volum": 13, "90": 13, "347": 13, "358": 13, "url": 13, "www": 13, "sciencedirect": 13, "scienc": 13, "pii": 13, "s0167739x18313359": 13, "doi": 13, "1016": 13, "2018": 13, "08": 13, "004": 13, "referenc": 13, "bayesian": [13, 18, 22], "willemsen2021bayesian": 13, "willemsen": [13, 18], "flori": 13, "jan": 13, "nieuwpoort": 13, "rob": 13, "workshop": 13, "pmb": 13, "supercomput": 13, "sc21": 13, "2021": 13, "arxiv": 13, "ab": 13, "2111": 13, "14991": 13, "difficulti": 13, "schoonhoven2022benchmark": 13, "schoonhoven": 13, "richard": 13, "batenburg": 13, "joost": 13, "ieee": 13, "transact": 13, "evolutionari": 13, "2022": 13, "consumpt": [13, 15, 17], "schoonhoven2022go": 13, "veenboer": 13, "bram": 13, "green": 13, "effici": [13, 15, 17], "steer": 13, "sc22": 13, "2211": 13, "07260": 13, "comprehens": 14, "recommend": [14, 20], "download": 14, "repo": 14, "continuum": 14, "io": 14, "miniconda3": 14, "x86_64": 14, "sh": 14, "newer": [14, 17], "nativ": 14, "prefix": 14, "home": 14, "pythonpath": 14, "bind": [14, 17], "older": 14, "troubl": 14, "retri": 14, "wiki": 14, "tiker": 14, "net": 14, "amd": [14, 17], "app": 14, "sdk": 14, "intel": 14, "appl": 14, "beignet": 14, "stack": 14, "altern": [14, 22], "navig": 14, "benvanwerkhoven": 14, "differenti": [14, 18, 22], "chanc": [14, 18, 21], "algebra": 15, "frequent": 15, "programm": [15, 17], "row": 15, "column": 15, "squar": 15, "matric": 15, "matmul_na": 15, "width": 15, "matmul_kernel": 15, "height": 15, "Of": 15, "solut": [15, 17], "realiti": 15, "contant": 15, "denot": [15, 19, 22], "sensibl": 15, "pick": 15, "word": 15, "warpsiz": 15, "namelijk": 15, "stand": 15, "briefli": 15, "figur": 15, "fifth": 15, "fourth": 15, "dramat": 15, "profil": 15, "pretti": 15, "opportun": 15, "realiz": 15, "collabor": 15, "bandwidth": 15, "techniqu": 15, "submatric": 15, "proce": 15, "matmul_shar": 15, "sa": 15, "sb": 15, "kb": 15, "outer": 15, "inner": 15, "race": 15, "drastic": 15, "due": [15, 21, 22], "fortun": 15, "benefit": 15, "redund": 15, "distinct": 15, "1xn": 15, "usag": [15, 17], "occup": 15, "goe": 15, "down": 15, "matmul": 15, "newli": 15, "coupl": 15, "respect": [15, 17], "independ": 15, "yield": 15, "discontinu": 15, "room": 15, "impos": 15, "report": [16, 17, 22, 23], "possibli": [16, 22], "_flop": 16, "total_flop": 16, "ps_energi": [16, 17, 23], "occur": [16, 22], "exhaust": 16, "brute": [16, 18, 19], "forc": [16, 18, 19, 21], "maxim": [16, 22], "boolean": [16, 17, 22], "facilit": 17, "layer": 17, "act": 17, "hook": 17, "pattern": 17, "subscrib": 17, "benchmarkobserv": 17, "overwritten": [17, 22], "extend": 17, "mandatori": 17, "get_result": 17, "aggreg": 17, "after_finish": 17, "after_start": 17, "before_start": 17, "register_configur": 17, "register_devic": 17, "variou": [17, 19], "registerobserv": 17, "track": 17, "num_reg": 17, "current_modul": 17, "powersensor2": 17, "pcie": 17, "intercept": 17, "sensor": 17, "transmit": 17, "usb": 17, "connect": 17, "advantag": 17, "instantan": 17, "frequenc": 17, "khz": 17, "pybind11": 17, "powersensor": [17, 23], "ps_power": [17, 23], "joul": [17, 23], "watt": [17, 23], "ttyacm0": 17, "core": 17, "voltag": 17, "thin": 17, "wrapper": [17, 21], "intricaci": 17, "friendli": 17, "mode": 17, "repeatedli": 17, "downsid": 17, "approach": 17, "save_al": 17, "nvidia_smi_fallback": 17, "use_locked_clock": 17, "continous_dur": 17, "monitor": 17, "clock": [17, 23], "power_read": [17, 23], "nvml_power": [17, 23], "nvml_energi": [17, 23], "core_freq": [17, 23], "mem_freq": [17, 23], "gr_voltag": 17, "ordin": 17, "identifi": 17, "smi": 17, "root": 17, "opt": 17, "amper": 17, "continuous_dur": 17, "common": [17, 21], "cap": 17, "popular": 17, "nvml_gr_clock": [17, 23], "nvml_mem_clock": [17, 23], "nvml_pwr_limit": [17, 23], "graphic": [17, 23], "jetson": 17, "rapl": 17, "xilinx": 17, "pmt": 17, "astron": 17, "nl": 17, "rd": 17, "meter": 17, "arduino": 17, "_energi": 17, "_power": 17, "acceler": 18, "prohibit": 18, "slow": 18, "wast": 18, "basin": [18, 22], "hop": [18, 22], "dual": [18, 22], "anneal": [18, 22], "evolut": [18, 22], "firefli": [18, 22], "genet": [18, 22], "greedi": [18, 22], "local": [18, 22], "multi": [18, 22], "particl": [18, 22], "swarm": [18, 22], "mechan": 18, "overrid": 18, "time_limit": [18, 22], "uniqu": [18, 22], "count": 18, "searchspac": 18, "runner": 18, "nelder": 18, "mead": 18, "powel": 18, "cg": 18, "bfg": 18, "l": 18, "tnc": 18, "cobyla": 18, "slsqp": 18, "reject": 18, "thesi": 18, "generate_normalized_param_dict": 18, "denorm": 18, "normalize_parameter_spac": 18, "param_spac": 18, "prune_parameter_spac": 18, "normalize_dict": 18, "prune": 18, "hyperparamet": 18, "popul": 18, "best1bin": 18, "best1exp": 18, "rand1exp": 18, "randtobest1exp": 18, "best2exp": 18, "rand2exp": 18, "randtobest1bin": 18, "best2bin": 18, "rand2bin": 18, "rand1bin": 18, "popsiz": 18, "maxit": 18, "constr": 18, "compute_intens": 18, "fun": 18, "intens": 18, "distance_to": 18, "euclidian": 18, "move_toward": 18, "alpha": 18, "toward": 18, "b0": 18, "attract": 18, "gamma": 18, "light": 18, "absorpt": 18, "coeffici": 18, "disruptive_uniform_crossov": 18, "dna1": 18, "dna2": 18, "disrupt": 18, "uniform": 18, "crossov": 18, "uniformli": 18, "gene": 18, "children": 18, "guarante": 18, "parent": 18, "mutat": 18, "dna": 18, "mutation_ch": 18, "single_point_crossov": 18, "single_point": 18, "two_point": 18, "disruptive_uniform": 18, "two_point_crossov": 18, "uniform_crossov": 18, "weighted_choic": 18, "probabl": [18, 22], "il": 18, "neighbor": 18, "ham": 18, "adjac": 18, "greedy": 18, "soon": 18, "no_improv": 18, "exce": 18, "50": 18, "random_walk": 18, "hillclimb": 18, "travers": 18, "inertia": 18, "c1": 18, "cognit": 18, "c2": 18, "social": 18, "fraction": 18, "acceptance_prob": 18, "old_cost": 18, "new_cost": 18, "modif": [18, 20], "po": 18, "t_min": 18, "001": 18, "995": 18, "vector_add_kernel": 19, "wise": 19, "1000000": [19, 21], "recogn": 19, "alright": 19, "portabl": 20, "stick": 20, "pointer": 20, "primit": 20, "lead": 20, "ineffici": 20, "situat": 20, "scientif": 20, "sens": 20, "experiment": 20, "pack": 20, "consult": 20, "create_receive_spec_struct": 20, "0l": 20, "pad": 20, "8byte": 20, "packstr": 20, "iiiiiiiiiiippi": 20, "fffi": 20, "nsampl": 20, "nsamplesiq": 20, "nslowtimesampl": 20, "nchannel": 20, "ntx": 20, "nrepeat": 20, "nfasttimesampl": 20, "rfsize": 20, "mnrow": 20, "mnrowsiq": 20, "nactivechannel": 20, "isiq": 20, "fsiq": 20, "fc": 20, "nbuffer": 20, "frombuff": 20, "len": 20, "receive_spec": 20, "bf": 20, "rf": 20, "recon": 20, "length": 20, "slight": 20, "matlab": 21, "typenam": 21, "my_typ": 21, "regardless": 21, "demot": 21, "rewrit": 21, "real": 21, "risk": 21, "seper": 21, "grid_div_z": 22, "06": 22, "log": 22, "auxilliari": 22, "safer": 22, "notat": 22, "divison": 22, "treat": 22, "warp": 22, "empti": 22, "kepler": 22, "plu": 22, "filter_mod": 22, "address_mod": 22, "clamp": 22, "mirror": 22, "axi": 22, "normalized_coordin": 22, "emtpi": 22, "get_local_s": 22, "satisfi": 22, "000001": 22, "ref": 22, "basinhop": 22, "bayes_opt": 22, "diff_evo": 22, "firefly_algorithm": 22, "genetic_algorithm": 22, "greedy_il": 22, "greedy_ml": 22, "ml": 22, "ordered_greedy_ml": 22, "pso": 22, "simulated_ann": 22, "sort": 22, "resourc": 22, "persist": 22, "consol": 22, "info": 22, "summar": 22, "store_result": 22, "results_filenam": 22, "typicali": 22, "percentag": 22, "create_device_target": 22, "header_filenam": 22, "target": 22, "dtarget_gpu": 22, "name_of_gpu": 22, "chosen": 22, "block_size_": 23, "grid_size_": 23, "compiler_opt_": 23, "loop_unroll_factor_": 23, "nvml_": 23, "nvmlobserv": 23}, "objects": {"kernel_tuner.backends.compiler": [[6, 0, 1, "", "CompilerFunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "cleanup_lib"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.cupy": [[6, 0, 1, "", "CupyFunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.hip": [[6, 0, 1, "", "HipFunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.nvcuda": [[6, 0, 1, "", "CudaFunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.opencl": [[6, 0, 1, "", "OpenCLFunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.backends.pycuda": [[6, 0, 1, "", "PyCudaFunctions"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "compile"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "kernel_finished"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "memcpy_htod"], [6, 1, 1, "", "memset"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "start_event"], [6, 1, 1, "", "stop_event"], [6, 1, 1, "", "synchronize"]], "kernel_tuner.core": [[6, 0, 1, "", "DeviceInterface"]], "kernel_tuner.core.DeviceInterface": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "benchmark"], [6, 1, 1, "", "benchmark_continuous"], [6, 1, 1, "", "benchmark_default"], [6, 1, 1, "", "check_kernel_output"], [6, 1, 1, "", "compile_kernel"], [6, 1, 1, "", "copy_constant_memory_args"], [6, 1, 1, "", "copy_shared_memory_args"], [6, 1, 1, "", "copy_texture_memory_args"], [6, 1, 1, "", "create_kernel_instance"], [6, 1, 1, "", "get_environment"], [6, 1, 1, "", "memcpy_dtoh"], [6, 1, 1, "", "preprocess_gpu_arguments"], [6, 1, 1, "", "ready_argument_list"], [6, 1, 1, "", "run_kernel"], [6, 1, 1, "", "set_nvml_parameters"]], "kernel_tuner": [[22, 2, 1, "", "create_device_targets"], [22, 2, 1, "", "run_kernel"], [22, 2, 1, "", "store_results"], [22, 2, 1, "", "tune_kernel"], [6, 3, 0, "-", "util"]], "kernel_tuner.observers": [[17, 0, 1, "", "BenchmarkObserver"]], "kernel_tuner.observers.BenchmarkObserver": [[17, 1, 1, "", "after_finish"], [17, 1, 1, "", "after_start"], [17, 1, 1, "", "before_start"], [17, 1, 1, "", "during"], [17, 1, 1, "", "get_results"], [17, 1, 1, "", "register_configuration"], [17, 1, 1, "", "register_device"]], "kernel_tuner.observers.nvml": [[17, 0, 1, "", "NVMLObserver"]], "kernel_tuner.observers.pmt": [[17, 0, 1, "", "PMTObserver"]], "kernel_tuner.observers.powersensor": [[17, 0, 1, "", "PowerSensorObserver"]], "kernel_tuner.runners.sequential": [[6, 0, 1, "", "SequentialRunner"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.runners.simulation": [[6, 0, 1, "", "SimulationRunner"]], "kernel_tuner.runners.simulation.SimulationRunner": [[6, 1, 1, "", "__init__"], [6, 1, 1, "", "run"]], "kernel_tuner.strategies": [[18, 3, 0, "-", "basinhopping"], [18, 3, 0, "-", "bayes_opt"], [18, 3, 0, "-", "brute_force"], [6, 3, 0, "-", "common"], [18, 3, 0, "-", "diff_evo"], [18, 3, 0, "-", "dual_annealing"], [18, 3, 0, "-", "firefly_algorithm"], [18, 3, 0, "-", "genetic_algorithm"], [18, 3, 0, "-", "greedy_ils"], [18, 3, 0, "-", "greedy_mls"], [18, 3, 0, "-", "minimize"], [18, 3, 0, "-", "mls"], [18, 3, 0, "-", "ordered_greedy_mls"], [18, 3, 0, "-", "pso"], [18, 3, 0, "-", "random_sample"], [18, 3, 0, "-", "simulated_annealing"]], "kernel_tuner.strategies.basinhopping": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.bayes_opt": [[18, 2, 1, "", "generate_normalized_param_dicts"], [18, 2, 1, "", "normalize_parameter_space"], [18, 2, 1, "", "prune_parameter_space"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.brute_force": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.common": [[6, 2, 1, "", "get_options"], [6, 2, 1, "", "get_strategy_docstring"], [6, 2, 1, "", "make_strategy_options_doc"], [6, 2, 1, "", "scale_from_params"], [6, 2, 1, "", "setup_method_arguments"], [6, 2, 1, "", "setup_method_options"], [6, 2, 1, "", "snap_to_nearest_config"], [6, 2, 1, "", "unscale_and_snap_to_nearest"]], "kernel_tuner.strategies.diff_evo": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.dual_annealing": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm": [[18, 0, 1, "", "Firefly"], [18, 2, 1, "", "tune"]], "kernel_tuner.strategies.firefly_algorithm.Firefly": [[18, 1, 1, "", "compute_intensity"], [18, 1, 1, "", "distance_to"], [18, 1, 1, "", "move_towards"]], "kernel_tuner.strategies.genetic_algorithm": [[18, 2, 1, "", "disruptive_uniform_crossover"], [18, 2, 1, "", "mutate"], [18, 2, 1, "", "single_point_crossover"], [18, 2, 1, "", "tune"], [18, 2, 1, "", "two_point_crossover"], [18, 2, 1, "", "uniform_crossover"], [18, 2, 1, "", "weighted_choice"]], "kernel_tuner.strategies.greedy_ils": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.minimize": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.pso": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.random_sample": [[18, 2, 1, "", "tune"]], "kernel_tuner.strategies.simulated_annealing": [[18, 2, 1, "", "acceptance_prob"], [18, 2, 1, "", "neighbor"], [18, 2, 1, "", "tune"]], "kernel_tuner.util": [[6, 0, 1, "", "CompilationFailedConfig"], [6, 0, 1, "", "ErrorConfig"], [6, 0, 1, "", "InvalidConfig"], [6, 0, 1, "", "NpEncoder"], [6, 0, 1, "", "RuntimeFailedConfig"], [6, 4, 1, "", "SkippableFailure"], [6, 4, 1, "", "StopCriterionReached"], [6, 2, 1, "", "check_argument_list"], [6, 2, 1, "", "check_argument_type"], [6, 2, 1, "", "check_restriction"], [6, 2, 1, "", "check_restrictions"], [6, 2, 1, "", "check_stop_criterion"], [6, 2, 1, "", "check_thread_block_dimensions"], [6, 2, 1, "", "check_tune_params_list"], [6, 2, 1, "", "compile_restrictions"], [6, 2, 1, "", "config_valid"], [6, 2, 1, "", "convert_constraint_restriction"], [6, 2, 1, "", "correct_open_cache"], [6, 2, 1, "", "cuda_error_check"], [6, 2, 1, "", "delete_temp_file"], [6, 2, 1, "", "detect_language"], [6, 2, 1, "", "dump_cache"], [6, 2, 1, "", "get_best_config"], [6, 2, 1, "", "get_config_string"], [6, 2, 1, "", "get_grid_dimensions"], [6, 2, 1, "", "get_instance_string"], [6, 2, 1, "", "get_kernel_string"], [6, 2, 1, "", "get_problem_size"], [6, 2, 1, "", "get_smem_args"], [6, 2, 1, "", "get_temp_filename"], [6, 2, 1, "", "get_thread_block_dimensions"], [6, 2, 1, "", "get_total_timings"], [6, 2, 1, "", "looks_like_a_filename"], [6, 2, 1, "", "normalize_verify_function"], [6, 2, 1, "", "parse_restrictions"], [6, 2, 1, "", "prepare_kernel_string"], [6, 2, 1, "", "print_config"], [6, 2, 1, "", "print_config_output"], [6, 2, 1, "", "process_cache"], [6, 2, 1, "", "process_metrics"], [6, 2, 1, "", "read_cache"], [6, 2, 1, "", "read_file"], [6, 2, 1, "", "replace_param_occurrences"], [6, 2, 1, "", "setup_block_and_grid"], [6, 2, 1, "", "store_cache"], [6, 2, 1, "", "to_valid_nvrtc_gpu_arch_cc"], [6, 2, 1, "", "write_file"]], "kernel_tuner.util.NpEncoder": [[6, 1, 1, "", "default"]]}, "objtypes": {"0": "py:class", "1": "py:method", "2": "py:function", "3": "py:module", "4": "py:exception"}, "objnames": {"0": ["py", "class", "Python class"], "1": ["py", "method", "Python method"], "2": ["py", "function", "Python function"], "3": ["py", "module", "Python module"], "4": ["py", "exception", "Python exception"]}, "titleterms": {"backend": [0, 6, 14, 21], "cuda": [0, 14, 15], "featur": [0, 2], "support": 0, "usag": [0, 13], "compil": [0, 6], "cach": 1, "file": 1, "The": [2, 13], "kernel": [2, 7, 8, 9, 10, 11, 13, 15, 21], "tuner": [2, 7, 8, 9, 10, 11, 13], "document": [2, 3, 6, 13, 22], "guid": [2, 3, 14], "refer": 2, "contribut": 3, "report": 3, "issu": 3, "code": [3, 7, 8, 9, 10, 12], "develop": 3, "environ": 3, "local": [3, 8], "setup": 3, "cluster": 3, "run": [3, 9], "test": [3, 4], "build": 3, "convolut": [4, 10], "2d": 4, "exampl": [4, 10, 13, 21], "implement": [4, 7, 8, 9], "tune": [4, 7, 8, 9, 11, 12, 15, 16, 17], "more": 4, "tunabl": 4, "paramet": [4, 9, 11, 17, 23], "correct": 5, "verif": 5, "design": 6, "strategi": [6, 18], "kernel_tun": [6, 18], "common": 6, "runner": 6, "sequenti": 6, "sequentialrunn": 6, "simulationrunn": 6, "devic": 6, "interfac": 6, "core": 6, "deviceinterfac": 6, "pycuda": [6, 14], "pycudafunct": 6, "cupi": 6, "cupyfunct": 6, "nvcuda": 6, "cudafunct": 6, "opencl": [6, 14], "openclfunct": 6, "compilerfunct": 6, "hip": [6, 14], "hipfunct": 6, "util": 6, "function": 6, "diffus": [7, 8, 9], "python": [7, 8, 9, 14], "comput": [7, 8, 9], "gpu": [7, 8, 9, 11], "auto": [7, 8, 9], "us": [7, 8, 9, 11, 15, 20], "share": [7, 8, 9, 15], "memori": [7, 8, 9, 15], "tile": [7, 8, 9], "store": [7, 8], "result": [7, 8], "tutori": [8, 9], "from": [8, 9], "physic": [8, 9], "best": 9, "product": 9, "c": 9, "vector": 10, "add": 10, "stencil": 10, "matrix": [10, 15], "multipl": [10, 15], "py": 10, "sepconv": 10, "convolution_correct": 10, "convolution_stream": 10, "reduct": 10, "spars": 10, "point": 10, "polygon": 10, "expdist": 10, "gener": 10, "3d": 11, "grid": 11, "let": 11, "": 11, "start": [11, 19], "cpu": 11, "move": 11, "optim": [11, 18], "host": 12, "number": 12, "stream": 12, "quick": 13, "instal": [13, 14], "citat": 13, "packag": 14, "other": 14, "pyopencl": 14, "pyhip": 14, "git": 14, "version": 14, "depend": 14, "naiv": 15, "increas": 15, "work": 15, "per": 15, "thread": 15, "metric": 16, "object": 16, "observ": 17, "powersensorobserv": 17, "nvmlobserv": 17, "execut": 17, "nvml": 17, "pmtobserv": 17, "basinhop": 18, "bayes_opt": 18, "brute_forc": 18, "diff_evo": 18, "dual_ann": 18, "firefly_algorithm": 18, "genetic_algorithm": 18, "greedy_il": 18, "greedy_ml": 18, "minim": 18, "ml": 18, "ordered_greedy_ml": 18, "pso": 18, "random_sampl": 18, "simulated_ann": 18, "get": 19, "struct": 20, "templat": 21, "select": 21, "api": 22, "vocabulari": 23}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "nbsphinx": 4, "sphinx": 58}, "alltitles": {"Backends": [[0, "backends"]], "CUDA Backends": [[0, "cuda-backends"]], "Backend feature support": [[0, "id1"]], "Backend usage and compiler": [[0, "id2"]], "Cache files": [[1, "cache-files"]], "The Kernel Tuner documentation": [[2, "the-kernel-tuner-documentation"], [13, "the-kernel-tuner-documentation"]], "Kernel Tuner": [[2, null]], "Guides": [[2, null]], "Features": [[2, null]], "Reference": [[2, null]], "Contribution guide": [[3, "contribution-guide"]], "Reporting Issues": [[3, "reporting-issues"]], "Contributing Code": [[3, "contributing-code"]], "Development environment": [[3, "development-environment"]], "Local setup": [[3, "local-setup"]], "Cluster setup": [[3, "cluster-setup"]], "Running tests": [[3, "running-tests"]], "Building documentation": [[3, "building-documentation"]], "Convolution": [[4, "Convolution"], [10, "convolution"]], "2D Convolution example": [[4, "2D-Convolution-example"]], "Implement a test": [[4, "Implement-a-test"]], "Tuning 2D Convolution": [[4, "Tuning-2D-Convolution"]], "More tunable parameters": [[4, "More-tunable-parameters"]], "Correctness Verification": [[5, "correctness-verification"]], "Design documentation": [[6, "design-documentation"]], "Strategies": [[6, "strategies"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "Runners": [[6, "runners"]], "kernel_tuner.runners.sequential.SequentialRunner": [[6, "kernel-tuner-runners-sequential-sequentialrunner"]], "kernel_tuner.runners.sequential.SimulationRunner": [[6, "kernel-tuner-runners-sequential-simulationrunner"]], "Device Interfaces": [[6, "device-interfaces"]], "kernel_tuner.core.DeviceInterface": [[6, "kernel-tuner-core-deviceinterface"]], "kernel_tuner.backends.pycuda.PyCudaFunctions": [[6, "kernel-tuner-backends-pycuda-pycudafunctions"]], "kernel_tuner.backends.cupy.CupyFunctions": [[6, "kernel-tuner-backends-cupy-cupyfunctions"]], "kernel_tuner.backends.nvcuda.CudaFunctions": [[6, "kernel-tuner-backends-nvcuda-cudafunctions"]], "kernel_tuner.backends.opencl.OpenCLFunctions": [[6, "kernel-tuner-backends-opencl-openclfunctions"]], "kernel_tuner.backends.compiler.CompilerFunctions": [[6, "kernel-tuner-backends-compiler-compilerfunctions"]], "kernel_tuner.backends.hip.HipFunctions": [[6, "kernel-tuner-backends-hip-hipfunctions"]], "Util Functions": [[6, "util-functions"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "Diffusion": [[7, "Diffusion"], [7, "id1"], [8, "Diffusion"], [9, "Diffusion"]], "Python implementation": [[7, "Python-implementation"], [8, "Python-implementation"], [9, "Python-implementation"]], "Computing on the GPU": [[7, "Computing-on-the-GPU"], [8, "Computing-on-the-GPU"], [9, "Computing-on-the-GPU"]], "Auto-Tuning with the Kernel Tuner": [[7, "Auto-Tuning-with-the-Kernel-Tuner"], [8, "Auto-Tuning-with-the-Kernel-Tuner"], [9, "Auto-Tuning-with-the-Kernel-Tuner"]], "Using Shared Memory": [[7, "Using-Shared-Memory"]], "Tiling GPU Code": [[7, "Tiling-GPU-Code"], [8, "Tiling-GPU-Code"], [9, "Tiling-GPU-Code"]], "Storing the results": [[7, "Storing-the-results"], [8, "Storing-the-results"]], "Tutorial: From physics to tuned GPU kernels": [[8, "Tutorial:-From-physics-to-tuned-GPU-kernels"], [9, "Tutorial:-From-physics-to-tuned-GPU-kernels"]], "Using Shared (local) Memory": [[8, "Using-Shared-(local)-Memory"]], "Using shared memory": [[9, "Using-shared-memory"], [15, "Using-shared-memory"]], "Using the best parameters in a production run": [[9, "Using-the-best-parameters-in-a-production-run"]], "Python run": [[9, "Python-run"]], "C run": [[9, "C-run"]], "Kernel Tuner Examples": [[10, "kernel-tuner-examples"]], "Vector Add": [[10, "vector-add"]], "Stencil": [[10, "stencil"]], "Matrix Multiplication": [[10, "matrix-multiplication"]], "convolution.py": [[10, "convolution-py"]], "sepconv.py": [[10, "sepconv-py"]], "convolution_correct.py": [[10, "convolution-correct-py"]], "convolution_streams.py": [[10, "convolution-streams-py"]], "Reduction": [[10, "reduction"]], "Sparse Matrix Vector Multiplication": [[10, "sparse-matrix-vector-multiplication"]], "Point-in-Polygon": [[10, "point-in-polygon"]], "ExpDist": [[10, "expdist"]], "Code Generator": [[10, "code-generator"]], "3D Grid on GPU with Kernel Tuner": [[11, "3D-Grid-on-GPU-with-Kernel-Tuner"]], "Let\u2019s start on the CPU": [[11, "Let's-start-on-the-CPU"]], "Let\u2019s move to the GPU": [[11, "Let's-move-to-the-GPU"]], "Tune the kernel": [[11, "Tune-the-kernel"]], "Using the optimized parameters": [[11, "Using-the-optimized-parameters"]], "Tuning Host Code": [[12, "tuning-host-code"]], "Tuning the number of streams": [[12, "tuning-the-number-of-streams"]], "Quick install": [[13, "quick-install"]], "Example usage": [[13, "example-usage"]], "Citation": [[13, "citation"]], "Installation": [[14, "installation"]], "Python": [[14, "python"]], "Installing Python Packages": [[14, "installing-python-packages"]], "CUDA and PyCUDA": [[14, "cuda-and-pycuda"]], "Other CUDA Backends": [[14, "other-cuda-backends"]], "OpenCL and PyOpenCL": [[14, "opencl-and-pyopencl"]], "HIP and PyHIP": [[14, "hip-and-pyhip"]], "Installing the git version": [[14, "installing-the-git-version"]], "Dependencies for the guides": [[14, "dependencies-for-the-guides"]], "Matrix multiplication": [[15, "Matrix-multiplication"]], "Naive CUDA kernel": [[15, "Naive-CUDA-kernel"]], "Tuning a naive kernel": [[15, "Tuning-a-naive-kernel"]], "Increase work per thread": [[15, "Increase-work-per-thread"]], "Metrics and Objectives": [[16, "metrics-and-objectives"]], "Metrics": [[16, "metrics"]], "Tuning Objectives": [[16, "tuning-objectives"]], "Observers": [[17, "observers"]], "PowerSensorObserver": [[17, "powersensorobserver"]], "NVMLObserver": [[17, "nvmlobserver"]], "Tuning execution parameters with NVML": [[17, "tuning-execution-parameters-with-nvml"]], "PMTObserver": [[17, "pmtobserver"]], "Optimization strategies": [[18, "optimization-strategies"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "Getting Started": [[19, "getting-started"]], "Using structs": [[20, "using-structs"]], "Templated kernels": [[21, "templated-kernels"]], "Example": [[21, "example"]], "Selecting a backend": [[21, "selecting-a-backend"]], "API Documentation": [[22, "api-documentation"]], "Parameter Vocabulary": [[23, "parameter-vocabulary"]]}, "indexentries": {"compilationfailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.CompilationFailedConfig"]], "compilerfunctions (class in kernel_tuner.backends.compiler)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions"]], "cudafunctions (class in kernel_tuner.backends.nvcuda)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions"]], "cupyfunctions (class in kernel_tuner.backends.cupy)": [[6, "kernel_tuner.backends.cupy.CupyFunctions"]], "deviceinterface (class in kernel_tuner.core)": [[6, "kernel_tuner.core.DeviceInterface"]], "errorconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.ErrorConfig"]], "hipfunctions (class in kernel_tuner.backends.hip)": [[6, "kernel_tuner.backends.hip.HipFunctions"]], "invalidconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.InvalidConfig"]], "npencoder (class in kernel_tuner.util)": [[6, "kernel_tuner.util.NpEncoder"]], "openclfunctions (class in kernel_tuner.backends.opencl)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions"]], "pycudafunctions (class in kernel_tuner.backends.pycuda)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions"]], "runtimefailedconfig (class in kernel_tuner.util)": [[6, "kernel_tuner.util.RuntimeFailedConfig"]], "sequentialrunner (class in kernel_tuner.runners.sequential)": [[6, "kernel_tuner.runners.sequential.SequentialRunner"]], "simulationrunner (class in kernel_tuner.runners.simulation)": [[6, "kernel_tuner.runners.simulation.SimulationRunner"]], "skippablefailure": [[6, "kernel_tuner.util.SkippableFailure"]], "stopcriterionreached": [[6, "kernel_tuner.util.StopCriterionReached"]], "__init__() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.__init__"]], "__init__() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.__init__"]], "__init__() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.__init__"]], "__init__() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.__init__"]], "__init__() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.__init__"]], "__init__() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.__init__"]], "__init__() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.__init__"]], "__init__() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.__init__"]], "__init__() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.__init__"]], "benchmark() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark"]], "benchmark_continuous() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_continuous"]], "benchmark_default() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.benchmark_default"]], "check_argument_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_list"]], "check_argument_type() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_argument_type"]], "check_kernel_output() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.check_kernel_output"]], "check_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restriction"]], "check_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_restrictions"]], "check_stop_criterion() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_stop_criterion"]], "check_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_thread_block_dimensions"]], "check_tune_params_list() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.check_tune_params_list"]], "cleanup_lib() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.cleanup_lib"]], "compile() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.compile"]], "compile() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.compile"]], "compile() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.compile"]], "compile() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.compile"]], "compile() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.compile"]], "compile() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.compile"]], "compile_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.compile_kernel"]], "compile_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.compile_restrictions"]], "config_valid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.config_valid"]], "convert_constraint_restriction() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.convert_constraint_restriction"]], "copy_constant_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_constant_memory_args"]], "copy_constant_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_constant_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_shared_memory_args"]], "copy_shared_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_shared_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.copy_texture_memory_args"]], "copy_texture_memory_args() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.copy_texture_memory_args"]], "correct_open_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.correct_open_cache"]], "create_kernel_instance() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.create_kernel_instance"]], "cuda_error_check() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.cuda_error_check"]], "default() (kernel_tuner.util.npencoder method)": [[6, "kernel_tuner.util.NpEncoder.default"]], "delete_temp_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.delete_temp_file"]], "detect_language() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.detect_language"]], "dump_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.dump_cache"]], "get_best_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_best_config"]], "get_config_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_config_string"]], "get_environment() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.get_environment"]], "get_grid_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_grid_dimensions"]], "get_instance_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_instance_string"]], "get_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_kernel_string"]], "get_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_options"]], "get_problem_size() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_problem_size"]], "get_smem_args() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_smem_args"]], "get_strategy_docstring() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.get_strategy_docstring"]], "get_temp_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_temp_filename"]], "get_thread_block_dimensions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_thread_block_dimensions"]], "get_total_timings() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.get_total_timings"]], "kernel_finished() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.kernel_finished"]], "kernel_finished() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.kernel_finished"]], "kernel_tuner.strategies.common": [[6, "module-kernel_tuner.strategies.common"]], "kernel_tuner.util": [[6, "module-kernel_tuner.util"]], "looks_like_a_filename() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.looks_like_a_filename"]], "make_strategy_options_doc() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.make_strategy_options_doc"]], "memcpy_dtoh() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_dtoh"]], "memcpy_dtoh() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.memcpy_dtoh"]], "memcpy_htod() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memcpy_htod"]], "memcpy_htod() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memcpy_htod"]], "memset() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.memset"]], "memset() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.memset"]], "memset() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.memset"]], "memset() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.memset"]], "memset() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.memset"]], "memset() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.memset"]], "module": [[6, "module-kernel_tuner.strategies.common"], [6, "module-kernel_tuner.util"], [18, "module-kernel_tuner.strategies.basinhopping"], [18, "module-kernel_tuner.strategies.bayes_opt"], [18, "module-kernel_tuner.strategies.brute_force"], [18, "module-kernel_tuner.strategies.diff_evo"], [18, "module-kernel_tuner.strategies.dual_annealing"], [18, "module-kernel_tuner.strategies.firefly_algorithm"], [18, "module-kernel_tuner.strategies.genetic_algorithm"], [18, "module-kernel_tuner.strategies.greedy_ils"], [18, "module-kernel_tuner.strategies.greedy_mls"], [18, "module-kernel_tuner.strategies.minimize"], [18, "module-kernel_tuner.strategies.mls"], [18, "module-kernel_tuner.strategies.ordered_greedy_mls"], [18, "module-kernel_tuner.strategies.pso"], [18, "module-kernel_tuner.strategies.random_sample"], [18, "module-kernel_tuner.strategies.simulated_annealing"]], "normalize_verify_function() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.normalize_verify_function"]], "parse_restrictions() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.parse_restrictions"]], "prepare_kernel_string() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.prepare_kernel_string"]], "preprocess_gpu_arguments() (kernel_tuner.core.deviceinterface static method)": [[6, "kernel_tuner.core.DeviceInterface.preprocess_gpu_arguments"]], "print_config() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config"]], "print_config_output() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.print_config_output"]], "process_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_cache"]], "process_metrics() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.process_metrics"]], "read_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_cache"]], "read_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.read_file"]], "ready_argument_list() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.ready_argument_list"]], "ready_argument_list() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.ready_argument_list"]], "replace_param_occurrences() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.replace_param_occurrences"]], "run() (kernel_tuner.runners.sequential.sequentialrunner method)": [[6, "kernel_tuner.runners.sequential.SequentialRunner.run"]], "run() (kernel_tuner.runners.simulation.simulationrunner method)": [[6, "kernel_tuner.runners.simulation.SimulationRunner.run"]], "run_kernel() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.run_kernel"]], "run_kernel() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.run_kernel"]], "run_kernel() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.run_kernel"]], "scale_from_params() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.scale_from_params"]], "set_nvml_parameters() (kernel_tuner.core.deviceinterface method)": [[6, "kernel_tuner.core.DeviceInterface.set_nvml_parameters"]], "setup_block_and_grid() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.setup_block_and_grid"]], "setup_method_arguments() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_arguments"]], "setup_method_options() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.setup_method_options"]], "snap_to_nearest_config() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.snap_to_nearest_config"]], "start_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.start_event"]], "start_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.start_event"]], "start_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.start_event"]], "start_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.start_event"]], "start_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.start_event"]], "start_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.start_event"]], "stop_event() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.nvcuda.cudafunctions method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.stop_event"]], "stop_event() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.stop_event"]], "store_cache() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.store_cache"]], "synchronize() (kernel_tuner.backends.compiler.compilerfunctions method)": [[6, "kernel_tuner.backends.compiler.CompilerFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.cupy.cupyfunctions method)": [[6, "kernel_tuner.backends.cupy.CupyFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.hip.hipfunctions method)": [[6, "kernel_tuner.backends.hip.HipFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.nvcuda.cudafunctions static method)": [[6, "kernel_tuner.backends.nvcuda.CudaFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.opencl.openclfunctions method)": [[6, "kernel_tuner.backends.opencl.OpenCLFunctions.synchronize"]], "synchronize() (kernel_tuner.backends.pycuda.pycudafunctions method)": [[6, "kernel_tuner.backends.pycuda.PyCudaFunctions.synchronize"]], "to_valid_nvrtc_gpu_arch_cc() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.to_valid_nvrtc_gpu_arch_cc"]], "unscale_and_snap_to_nearest() (in module kernel_tuner.strategies.common)": [[6, "kernel_tuner.strategies.common.unscale_and_snap_to_nearest"]], "write_file() (in module kernel_tuner.util)": [[6, "kernel_tuner.util.write_file"]], "benchmarkobserver (class in kernel_tuner.observers)": [[17, "kernel_tuner.observers.BenchmarkObserver"]], "nvmlobserver (class in kernel_tuner.observers.nvml)": [[17, "kernel_tuner.observers.nvml.NVMLObserver"]], "pmtobserver (class in kernel_tuner.observers.pmt)": [[17, "kernel_tuner.observers.pmt.PMTObserver"]], "powersensorobserver (class in kernel_tuner.observers.powersensor)": [[17, "kernel_tuner.observers.powersensor.PowerSensorObserver"]], "after_finish() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_finish"]], "after_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.after_start"]], "before_start() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.before_start"]], "during() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.during"]], "get_results() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.get_results"]], "register_configuration() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_configuration"]], "register_device() (kernel_tuner.observers.benchmarkobserver method)": [[17, "kernel_tuner.observers.BenchmarkObserver.register_device"]], "firefly (class in kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly"]], "acceptance_prob() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.acceptance_prob"]], "compute_intensity() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.compute_intensity"]], "disruptive_uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.disruptive_uniform_crossover"]], "distance_to() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.distance_to"]], "generate_normalized_param_dicts() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.generate_normalized_param_dicts"]], "kernel_tuner.strategies.basinhopping": [[18, "module-kernel_tuner.strategies.basinhopping"]], "kernel_tuner.strategies.bayes_opt": [[18, "module-kernel_tuner.strategies.bayes_opt"]], "kernel_tuner.strategies.brute_force": [[18, "module-kernel_tuner.strategies.brute_force"]], "kernel_tuner.strategies.diff_evo": [[18, "module-kernel_tuner.strategies.diff_evo"]], "kernel_tuner.strategies.dual_annealing": [[18, "module-kernel_tuner.strategies.dual_annealing"]], "kernel_tuner.strategies.firefly_algorithm": [[18, "module-kernel_tuner.strategies.firefly_algorithm"]], "kernel_tuner.strategies.genetic_algorithm": [[18, "module-kernel_tuner.strategies.genetic_algorithm"]], "kernel_tuner.strategies.greedy_ils": [[18, "module-kernel_tuner.strategies.greedy_ils"]], "kernel_tuner.strategies.greedy_mls": [[18, "module-kernel_tuner.strategies.greedy_mls"]], "kernel_tuner.strategies.minimize": [[18, "module-kernel_tuner.strategies.minimize"]], "kernel_tuner.strategies.mls": [[18, "module-kernel_tuner.strategies.mls"]], "kernel_tuner.strategies.ordered_greedy_mls": [[18, "module-kernel_tuner.strategies.ordered_greedy_mls"]], "kernel_tuner.strategies.pso": [[18, "module-kernel_tuner.strategies.pso"]], "kernel_tuner.strategies.random_sample": [[18, "module-kernel_tuner.strategies.random_sample"]], "kernel_tuner.strategies.simulated_annealing": [[18, "module-kernel_tuner.strategies.simulated_annealing"]], "move_towards() (kernel_tuner.strategies.firefly_algorithm.firefly method)": [[18, "kernel_tuner.strategies.firefly_algorithm.Firefly.move_towards"]], "mutate() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.mutate"]], "neighbor() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.neighbor"]], "normalize_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.normalize_parameter_space"]], "prune_parameter_space() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.prune_parameter_space"]], "single_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.single_point_crossover"]], "tune() (in module kernel_tuner.strategies.basinhopping)": [[18, "kernel_tuner.strategies.basinhopping.tune"]], "tune() (in module kernel_tuner.strategies.bayes_opt)": [[18, "kernel_tuner.strategies.bayes_opt.tune"]], "tune() (in module kernel_tuner.strategies.brute_force)": [[18, "kernel_tuner.strategies.brute_force.tune"]], "tune() (in module kernel_tuner.strategies.diff_evo)": [[18, "kernel_tuner.strategies.diff_evo.tune"]], "tune() (in module kernel_tuner.strategies.dual_annealing)": [[18, "kernel_tuner.strategies.dual_annealing.tune"]], "tune() (in module kernel_tuner.strategies.firefly_algorithm)": [[18, "kernel_tuner.strategies.firefly_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.tune"]], "tune() (in module kernel_tuner.strategies.greedy_ils)": [[18, "kernel_tuner.strategies.greedy_ils.tune"]], "tune() (in module kernel_tuner.strategies.greedy_mls)": [[18, "kernel_tuner.strategies.greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.minimize)": [[18, "kernel_tuner.strategies.minimize.tune"]], "tune() (in module kernel_tuner.strategies.mls)": [[18, "kernel_tuner.strategies.mls.tune"]], "tune() (in module kernel_tuner.strategies.ordered_greedy_mls)": [[18, "kernel_tuner.strategies.ordered_greedy_mls.tune"]], "tune() (in module kernel_tuner.strategies.pso)": [[18, "kernel_tuner.strategies.pso.tune"]], "tune() (in module kernel_tuner.strategies.random_sample)": [[18, "kernel_tuner.strategies.random_sample.tune"]], "tune() (in module kernel_tuner.strategies.simulated_annealing)": [[18, "kernel_tuner.strategies.simulated_annealing.tune"]], "two_point_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.two_point_crossover"]], "uniform_crossover() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.uniform_crossover"]], "weighted_choice() (in module kernel_tuner.strategies.genetic_algorithm)": [[18, "kernel_tuner.strategies.genetic_algorithm.weighted_choice"]], "create_device_targets() (in module kernel_tuner)": [[22, "kernel_tuner.create_device_targets"]], "run_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.run_kernel"]], "store_results() (in module kernel_tuner)": [[22, "kernel_tuner.store_results"]], "tune_kernel() (in module kernel_tuner)": [[22, "kernel_tuner.tune_kernel"]]}}) \ No newline at end of file