diff --git a/benchmarks/data.js b/benchmarks/data.js index 446cd748..762ac288 100644 --- a/benchmarks/data.js +++ b/benchmarks/data.js @@ -1,5 +1,5 @@ window.BENCHMARK_DATA = { - "lastUpdate": 1724443708207, + "lastUpdate": 1724454030302, "repoUrl": "https://github.com/LuxDL/LuxLib.jl", "entries": { "LuxLib Benchmarks": [ @@ -126974,6 +126974,9498 @@ window.BENCHMARK_DATA = { "extra": "gctime=0\nmemory=77624\nallocs=1666\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" } ] + }, + { + "commit": { + "author": { + "email": "avikpal@mit.edu", + "name": "Avik Pal", + "username": "avik-pal" + }, + "committer": { + "email": "avik.pal.2017@gmail.com", + "name": "Avik Pal", + "username": "avik-pal" + }, + "distinct": true, + "id": "45fee0a4ac07b4907d153d927a6bdc10b77f20bc", + "message": "test: simple enzyme forward test to check no crash", + "timestamp": "2024-08-23T14:57:51-07:00", + "tree_id": "0c0785d421a458f0b84b084dd4bf5a0971136f9e", + "url": "https://github.com/LuxDL/LuxLib.jl/commit/45fee0a4ac07b4907d153d927a6bdc10b77f20bc" + }, + "date": 1724454029707, + "tool": "julia", + "benches": [ + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 6645.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5958, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 7792, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5437, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 117530, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 851584, + "unit": "ns", + "extra": "gctime=0\nmemory=40528\nallocs=1247\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 424104, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=532\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9834, + "unit": "ns", + "extra": "gctime=0\nmemory=15136\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10042, + "unit": "ns", + "extra": "gctime=0\nmemory=15136\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10000, + "unit": "ns", + "extra": "gctime=0\nmemory=15136\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 9708, + "unit": "ns", + "extra": "gctime=0\nmemory=15136\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 538763, + "unit": "ns", + "extra": "gctime=0\nmemory=73824\nallocs=2886\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 2566416, + "unit": "ns", + "extra": "gctime=0\nmemory=233136\nallocs=7642\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 685807, + "unit": "ns", + "extra": "gctime=0\nmemory=54496\nallocs=1194\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 1625, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 2416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 1895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 2958, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 21687, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/Metal", + "value": 212666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7160\nallocs=233\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 37011, + "unit": "ns", + "extra": "gctime=0\nmemory=1888\nallocs=70\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 3958, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 3854, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 3937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 4354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 143890, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/Metal", + "value": 1548375, + "unit": "ns", + "extra": "gctime=0\nmemory=55984\nallocs=1868\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 150252, + "unit": "ns", + "extra": "gctime=0\nmemory=16344\nallocs=385\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 57792, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 39959, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 46666, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 83333, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 37041, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1028896, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 82890.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=208\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2029520.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7355648\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2096833, + "unit": "ns", + "extra": "gctime=0\nmemory=7355648\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2092209, + "unit": "ns", + "extra": "gctime=0\nmemory=7355648\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1990709, + "unit": "ns", + "extra": "gctime=0\nmemory=7355648\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 227911.5, + "unit": "ns", + "extra": "gctime=0\nmemory=21240\nallocs=777\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 7599333, + "unit": "ns", + "extra": "gctime=0\nmemory=211512\nallocs=6761\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1314332, + "unit": "ns", + "extra": "gctime=0\nmemory=94088\nallocs=2266\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 147417, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 150979, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 152333, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 152708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 165380, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1531291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56176\nallocs=1803\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 174842, + "unit": "ns", + "extra": "gctime=0\nmemory=18960\nallocs=437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1106333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8400368\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1125708, + "unit": "ns", + "extra": "gctime=0\nmemory=8400368\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1115083, + "unit": "ns", + "extra": "gctime=0\nmemory=8400368\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1115937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8400368\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 695023, + "unit": "ns", + "extra": "gctime=0\nmemory=83920\nallocs=3063\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 5817541, + "unit": "ns", + "extra": "gctime=0\nmemory=280824\nallocs=8736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1029500, + "unit": "ns", + "extra": "gctime=0\nmemory=74560\nallocs=1556\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 5125, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 4458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6083, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5000.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 91412.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 448979, + "unit": "ns", + "extra": "gctime=0\nmemory=37584\nallocs=1260\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 60800, + "unit": "ns", + "extra": "gctime=0\nmemory=10896\nallocs=270\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8666, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8792, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9042, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8792, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 598490, + "unit": "ns", + "extra": "gctime=0\nmemory=75968\nallocs=3041\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5903917, + "unit": "ns", + "extra": "gctime=0\nmemory=244040\nallocs=8111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 389469, + "unit": "ns", + "extra": "gctime=0\nmemory=56560\nallocs=1238\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 20667, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 19250, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 19958, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 18541, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 68484, + "unit": "ns", + "extra": "gctime=0\nmemory=12544\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1311833, + "unit": "ns", + "extra": "gctime=0\nmemory=34288\nallocs=1053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 73831, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=333\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 212750, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 220458, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 220250, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 221500, + "unit": "ns", + "extra": "gctime=0\nmemory=813456\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 354536, + "unit": "ns", + "extra": "gctime=0\nmemory=64176\nallocs=1466\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 5845458, + "unit": "ns", + "extra": "gctime=0\nmemory=171040\nallocs=4725\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 474284, + "unit": "ns", + "extra": "gctime=0\nmemory=68112\nallocs=1148\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 667, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 833, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 645.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 20000, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/Metal", + "value": 294458, + "unit": "ns", + "extra": "gctime=0\nmemory=6880\nallocs=231\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 32961, + "unit": "ns", + "extra": "gctime=0\nmemory=1888\nallocs=70\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 1417, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 1417, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 1583, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 1583, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 123534, + "unit": "ns", + "extra": "gctime=0\nmemory=14616\nallocs=562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/Metal", + "value": 1493874.5, + "unit": "ns", + "extra": "gctime=0\nmemory=54336\nallocs=1843\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 126886, + "unit": "ns", + "extra": "gctime=0\nmemory=13448\nallocs=319\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7458, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5334, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 6250, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10292, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 24129, + "unit": "ns", + "extra": "gctime=0\nmemory=2800\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 615729, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 47500, + "unit": "ns", + "extra": "gctime=0\nmemory=6576\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 230041, + "unit": "ns", + "extra": "gctime=0\nmemory=933072\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 237500, + "unit": "ns", + "extra": "gctime=0\nmemory=933072\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 262792, + "unit": "ns", + "extra": "gctime=0\nmemory=933072\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 233458, + "unit": "ns", + "extra": "gctime=0\nmemory=933072\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 182530.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19384\nallocs=711\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 9080458, + "unit": "ns", + "extra": "gctime=0\nmemory=247640\nallocs=8001\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 610315.5, + "unit": "ns", + "extra": "gctime=0\nmemory=79208\nallocs=1823\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 4125, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 4125, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 4125, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 4125, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 23152, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/Metal", + "value": 224833, + "unit": "ns", + "extra": "gctime=0\nmemory=7384\nallocs=263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 49110, + "unit": "ns", + "extra": "gctime=0\nmemory=2064\nallocs=84\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 16833, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 16584, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 17084, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 16916, + "unit": "ns", + "extra": "gctime=0\nmemory=71920\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 192479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15672\nallocs=581\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/Metal", + "value": 965958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49448\nallocs=1643\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 178772, + "unit": "ns", + "extra": "gctime=0\nmemory=13768\nallocs=346\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 509792, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 332375, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 404500, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 864583, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 113511.5, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/Metal", + "value": 474250, + "unit": "ns", + "extra": "gctime=0\nmemory=7408\nallocs=264\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 248712, + "unit": "ns", + "extra": "gctime=0\nmemory=8240\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 2322292, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1755583, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 2033750, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 3280334, + "unit": "ns", + "extra": "gctime=0\nmemory=2360672\nallocs=33\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 238729, + "unit": "ns", + "extra": "gctime=0\nmemory=18568\nallocs=713\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/Metal", + "value": 1974104, + "unit": "ns", + "extra": "gctime=0\nmemory=58840\nallocs=1949\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 760717.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16760\nallocs=520\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 6520.5, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 6292, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 7583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 5875, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 90935.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 743625, + "unit": "ns", + "extra": "gctime=0\nmemory=36856\nallocs=1231\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 60411, + "unit": "ns", + "extra": "gctime=0\nmemory=10944\nallocs=273\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 11625, + "unit": "ns", + "extra": "gctime=0\nmemory=51120\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 11958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=51120\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 12271, + "unit": "ns", + "extra": "gctime=0\nmemory=51120\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 12396, + "unit": "ns", + "extra": "gctime=0\nmemory=51120\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 637206, + "unit": "ns", + "extra": "gctime=0\nmemory=72848\nallocs=2885\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5519292, + "unit": "ns", + "extra": "gctime=0\nmemory=231728\nallocs=7544\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 414184, + "unit": "ns", + "extra": "gctime=0\nmemory=54128\nallocs=1185\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 541, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 23373, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/Metal", + "value": 325562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7696\nallocs=277\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 54120, + "unit": "ns", + "extra": "gctime=0\nmemory=1888\nallocs=70\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2084, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2083, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 2167, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 2125, + "unit": "ns", + "extra": "gctime=0\nmemory=4880\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 220940, + "unit": "ns", + "extra": "gctime=0\nmemory=15592\nallocs=578\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/Metal", + "value": 1999750, + "unit": "ns", + "extra": "gctime=0\nmemory=59120\nallocs=2034\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 182151, + "unit": "ns", + "extra": "gctime=0\nmemory=14408\nallocs=343\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 8959, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 8917, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 10479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9458, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 107673.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12752\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 816459, + "unit": "ns", + "extra": "gctime=0\nmemory=35736\nallocs=1143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 75421, + "unit": "ns", + "extra": "gctime=0\nmemory=16128\nallocs=351\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 18145.5, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 17834, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 18875, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 18563, + "unit": "ns", + "extra": "gctime=0\nmemory=60944\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 617019, + "unit": "ns", + "extra": "gctime=0\nmemory=73832\nallocs=1708\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5305458, + "unit": "ns", + "extra": "gctime=0\nmemory=190776\nallocs=5371\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 387888.5, + "unit": "ns", + "extra": "gctime=0\nmemory=77832\nallocs=1299\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 541, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 35086, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 440000, + "unit": "ns", + "extra": "gctime=0\nmemory=13656\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 45921, + "unit": "ns", + "extra": "gctime=0\nmemory=6480\nallocs=173\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 10750, + "unit": "ns", + "extra": "gctime=0\nmemory=40448\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 10479, + "unit": "ns", + "extra": "gctime=0\nmemory=40448\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40448\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10250, + "unit": "ns", + "extra": "gctime=0\nmemory=40448\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 261459, + "unit": "ns", + "extra": "gctime=0\nmemory=20192\nallocs=724\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4700125, + "unit": "ns", + "extra": "gctime=0\nmemory=152672\nallocs=5001\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 373724, + "unit": "ns", + "extra": "gctime=0\nmemory=49312\nallocs=1213\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/2 thread(s)", + "value": 396937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/4 thread(s)", + "value": 215458, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/8 thread(s)", + "value": 288000, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/CPU/1 thread(s)", + "value": 756125, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/CUDA", + "value": 112222, + "unit": "ns", + "extra": "gctime=0\nmemory=1168\nallocs=48\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/Metal", + "value": 396166, + "unit": "ns", + "extra": "gctime=0\nmemory=2736\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/forward/GPU/AMDGPU", + "value": 76981, + "unit": "ns", + "extra": "gctime=0\nmemory=2064\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1455708, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 857833, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1133208, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2441333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1836144\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/CUDA", + "value": 205916, + "unit": "ns", + "extra": "gctime=0\nmemory=13808\nallocs=502\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/Metal", + "value": 1638958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=45152\nallocs=1518\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=identity)(512 x 128)/zygote/GPU/AMDGPU", + "value": 325533, + "unit": "ns", + "extra": "gctime=0\nmemory=12944\nallocs=380\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 6916, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 7833, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 8417, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 7208, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 152876.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 451312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=38232\nallocs=1254\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 62081, + "unit": "ns", + "extra": "gctime=0\nmemory=10800\nallocs=264\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 15708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14854, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 14521, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 16625, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 1044876, + "unit": "ns", + "extra": "gctime=0\nmemory=78192\nallocs=3110\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5855584, + "unit": "ns", + "extra": "gctime=0\nmemory=248432\nallocs=8060\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 433489.5, + "unit": "ns", + "extra": "gctime=0\nmemory=59328\nallocs=1293\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 31750, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 28417, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 28166, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 25875, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 225622, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 631917, + "unit": "ns", + "extra": "gctime=0\nmemory=48184\nallocs=1530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 118121, + "unit": "ns", + "extra": "gctime=0\nmemory=18304\nallocs=396\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 112958, + "unit": "ns", + "extra": "gctime=0\nmemory=1060336\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 103792, + "unit": "ns", + "extra": "gctime=0\nmemory=1060336\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 151396, + "unit": "ns", + "extra": "gctime=0\nmemory=1060336\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 146291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1060336\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1190197.5, + "unit": "ns", + "extra": "gctime=0\nmemory=83920\nallocs=3063\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 5968062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=256368\nallocs=7902\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 597156, + "unit": "ns", + "extra": "gctime=0\nmemory=70336\nallocs=1439\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 88250, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 77750, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 78291, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 74166, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 231353, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 537791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49976\nallocs=1562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 127991.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18304\nallocs=396\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 279770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 306041, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 319103.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 301792, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1223547, + "unit": "ns", + "extra": "gctime=0\nmemory=90592\nallocs=3393\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6540125, + "unit": "ns", + "extra": "gctime=0\nmemory=288696\nallocs=8884\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 698201.5, + "unit": "ns", + "extra": "gctime=0\nmemory=75808\nallocs=1556\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 17042, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 17250, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 17146, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 16771, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 164086, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 451167, + "unit": "ns", + "extra": "gctime=0\nmemory=38232\nallocs=1254\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 238683, + "unit": "ns", + "extra": "gctime=0\nmemory=13424\nallocs=428\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 27354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 24833, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 27312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 27749.5, + "unit": "ns", + "extra": "gctime=0\nmemory=89120\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 1038807, + "unit": "ns", + "extra": "gctime=0\nmemory=79248\nallocs=3154\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 6017250, + "unit": "ns", + "extra": "gctime=0\nmemory=248240\nallocs=8054\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 701206, + "unit": "ns", + "extra": "gctime=0\nmemory=59728\nallocs=1299\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 11395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 12250, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 12375, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 11333, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 139180, + "unit": "ns", + "extra": "gctime=0\nmemory=12496\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 842812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=35736\nallocs=1143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 241222, + "unit": "ns", + "extra": "gctime=0\nmemory=22416\nallocs=602\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 21687, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 21271, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 22125, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 22625, + "unit": "ns", + "extra": "gctime=0\nmemory=52368\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 749757, + "unit": "ns", + "extra": "gctime=0\nmemory=62104\nallocs=1758\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5233187, + "unit": "ns", + "extra": "gctime=0\nmemory=179904\nallocs=5393\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 686992, + "unit": "ns", + "extra": "gctime=0\nmemory=66264\nallocs=1365\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 66999.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 63750, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 65708, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 63479, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 120818, + "unit": "ns", + "extra": "gctime=0\nmemory=12544\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1336750, + "unit": "ns", + "extra": "gctime=0\nmemory=34288\nallocs=1053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 228922.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16816\nallocs=344\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 484146, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 449000, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 445542, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 448916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1064816\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 559738.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56264\nallocs=1599\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6432542, + "unit": "ns", + "extra": "gctime=0\nmemory=169656\nallocs=4968\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 721207, + "unit": "ns", + "extra": "gctime=0\nmemory=58664\nallocs=1178\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 7624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 7834, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 7750, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 7458, + "unit": "ns", + "extra": "gctime=0\nmemory=13728\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 161448.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 449750, + "unit": "ns", + "extra": "gctime=0\nmemory=38232\nallocs=1254\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 59581, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 13708, + "unit": "ns", + "extra": "gctime=0\nmemory=80352\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14625, + "unit": "ns", + "extra": "gctime=0\nmemory=80352\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 15667, + "unit": "ns", + "extra": "gctime=0\nmemory=80352\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 14541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=80352\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 1019503, + "unit": "ns", + "extra": "gctime=0\nmemory=75120\nallocs=2979\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5626021, + "unit": "ns", + "extra": "gctime=0\nmemory=235968\nallocs=7662\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 403264, + "unit": "ns", + "extra": "gctime=0\nmemory=57328\nallocs=1244\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/2 thread(s)", + "value": 6154458, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/4 thread(s)", + "value": 3223916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/8 thread(s)", + "value": 6375625, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/CPU/1 thread(s)", + "value": 11909375, + "unit": "ns", + "extra": "gctime=0\nmemory=4194368\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/GPU/CUDA", + "value": 301453, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/forward/GPU/AMDGPU", + "value": 294773, + "unit": "ns", + "extra": "gctime=0\nmemory=5472\nallocs=294\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 19093562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 11085604, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 19938020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16778320\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 36629791, + "unit": "ns", + "extra": "gctime=0\nmemory=16778128\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/GPU/CUDA", + "value": 1145132, + "unit": "ns", + "extra": "gctime=0\nmemory=11824\nallocs=431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=4)/zygote/GPU/AMDGPU", + "value": 1161766.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20192\nallocs=583\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 958, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1000, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 958, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 958, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 23494, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/Metal", + "value": 329625, + "unit": "ns", + "extra": "gctime=0\nmemory=7696\nallocs=277\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 216242, + "unit": "ns", + "extra": "gctime=0\nmemory=8320\nallocs=330\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 3667, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 3708, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3750, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 3667, + "unit": "ns", + "extra": "gctime=0\nmemory=5776\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 296983, + "unit": "ns", + "extra": "gctime=0\nmemory=17960\nallocs=683\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/Metal", + "value": 2127729, + "unit": "ns", + "extra": "gctime=0\nmemory=64992\nallocs=2231\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 647936, + "unit": "ns", + "extra": "gctime=0\nmemory=17576\nallocs=495\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8334, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8041, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 9688, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 7917, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 136939, + "unit": "ns", + "extra": "gctime=0\nmemory=12368\nallocs=300\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 738708, + "unit": "ns", + "extra": "gctime=0\nmemory=35440\nallocs=1140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 67671, + "unit": "ns", + "extra": "gctime=0\nmemory=15920\nallocs=338\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 11584, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 11792, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 12750, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 12625, + "unit": "ns", + "extra": "gctime=0\nmemory=34432\nallocs=124\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 721402, + "unit": "ns", + "extra": "gctime=0\nmemory=71096\nallocs=1622\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5162750, + "unit": "ns", + "extra": "gctime=0\nmemory=188288\nallocs=5386\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 363024, + "unit": "ns", + "extra": "gctime=0\nmemory=74088\nallocs=1180\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/CUDA", + "value": 22847, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/Metal", + "value": 317792, + "unit": "ns", + "extra": "gctime=0\nmemory=7104\nallocs=261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/forward/GPU/AMDGPU", + "value": 51281, + "unit": "ns", + "extra": "gctime=0\nmemory=1968\nallocs=78\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 3000, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2875, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3125, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 2875, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/CUDA", + "value": 210440.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13448\nallocs=512\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/Metal", + "value": 1577208, + "unit": "ns", + "extra": "gctime=0\nmemory=46784\nallocs=1608\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=relu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 160702, + "unit": "ns", + "extra": "gctime=0\nmemory=10680\nallocs=268\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 10625, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 11709, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 13167, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 11167, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 138633, + "unit": "ns", + "extra": "gctime=0\nmemory=11664\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 907792, + "unit": "ns", + "extra": "gctime=0\nmemory=33016\nallocs=1047\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 239227, + "unit": "ns", + "extra": "gctime=0\nmemory=18720\nallocs=538\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 21062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 23604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 21416.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 20958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48912\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 652488.5, + "unit": "ns", + "extra": "gctime=0\nmemory=54152\nallocs=1599\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4746874.5, + "unit": "ns", + "extra": "gctime=0\nmemory=153416\nallocs=4612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 654136, + "unit": "ns", + "extra": "gctime=0\nmemory=55992\nallocs=1179\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 4416, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 4542, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 4375, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 4334, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/CUDA", + "value": 24791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/Metal", + "value": 227833, + "unit": "ns", + "extra": "gctime=0\nmemory=7976\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/forward/GPU/AMDGPU", + "value": 52640, + "unit": "ns", + "extra": "gctime=0\nmemory=2176\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 16542, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 16375, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 16708, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 16625, + "unit": "ns", + "extra": "gctime=0\nmemory=71488\nallocs=15\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/CUDA", + "value": 351658.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17816\nallocs=647\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/Metal", + "value": 1485875, + "unit": "ns", + "extra": "gctime=0\nmemory=61216\nallocs=2053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=relu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 212337, + "unit": "ns", + "extra": "gctime=0\nmemory=17672\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 1958, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 2042, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 2042, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 2000, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 36523, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 462667, + "unit": "ns", + "extra": "gctime=0\nmemory=13656\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 207782, + "unit": "ns", + "extra": "gctime=0\nmemory=12848\nallocs=429\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 19125, + "unit": "ns", + "extra": "gctime=0\nmemory=49152\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 18458, + "unit": "ns", + "extra": "gctime=0\nmemory=49152\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 18833, + "unit": "ns", + "extra": "gctime=0\nmemory=49152\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 18125, + "unit": "ns", + "extra": "gctime=0\nmemory=49152\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 303670, + "unit": "ns", + "extra": "gctime=0\nmemory=23592\nallocs=890\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4639708, + "unit": "ns", + "extra": "gctime=0\nmemory=161880\nallocs=5261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 702222, + "unit": "ns", + "extra": "gctime=0\nmemory=51960\nallocs=1325\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/2 thread(s)", + "value": 60208, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/4 thread(s)", + "value": 60959, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/8 thread(s)", + "value": 65604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/CPU/1 thread(s)", + "value": 51083, + "unit": "ns", + "extra": "gctime=0\nmemory=524352\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/GPU/CUDA", + "value": 66579, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/forward/GPU/AMDGPU", + "value": 95291, + "unit": "ns", + "extra": "gctime=0\nmemory=14208\nallocs=66\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 199667, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 115166, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 119292, + "unit": "ns", + "extra": "gctime=0\nmemory=2098256\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 295479, + "unit": "ns", + "extra": "gctime=0\nmemory=2098064\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/GPU/CUDA", + "value": 229950, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=512)/zygote/GPU/AMDGPU", + "value": 584990.5, + "unit": "ns", + "extra": "gctime=0\nmemory=55040\nallocs=439\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 86146, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 81709, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 87667, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 127312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 192185, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1983854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50976\nallocs=1614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 169421, + "unit": "ns", + "extra": "gctime=0\nmemory=24240\nallocs=514\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1920229, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1925709, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1919895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1914750, + "unit": "ns", + "extra": "gctime=0\nmemory=6318480\nallocs=127\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 573852, + "unit": "ns", + "extra": "gctime=0\nmemory=64208\nallocs=1468\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 9135542, + "unit": "ns", + "extra": "gctime=0\nmemory=188160\nallocs=5313\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1085326, + "unit": "ns", + "extra": "gctime=0\nmemory=88160\nallocs=1694\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/2 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/8 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/CUDA", + "value": 21768, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/Metal", + "value": 353875, + "unit": "ns", + "extra": "gctime=0\nmemory=7072\nallocs=261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/forward/GPU/AMDGPU", + "value": 46981, + "unit": "ns", + "extra": "gctime=0\nmemory=1936\nallocs=76\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 1833, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 1875, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 1834, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 1792, + "unit": "ns", + "extra": "gctime=0\nmemory=3584\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/CUDA", + "value": 267259, + "unit": "ns", + "extra": "gctime=0\nmemory=13200\nallocs=472\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/Metal", + "value": 1021146, + "unit": "ns", + "extra": "gctime=0\nmemory=52520\nallocs=1821\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=true, act=identity)(2 x 128)/zygote/GPU/AMDGPU", + "value": 185402, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=329\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 10042, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8250, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10625, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 9000, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 134677.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11536\nallocs=274\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 887250, + "unit": "ns", + "extra": "gctime=0\nmemory=32720\nallocs=1044\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 239022, + "unit": "ns", + "extra": "gctime=0\nmemory=21872\nallocs=593\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9500, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 11709, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 9208, + "unit": "ns", + "extra": "gctime=0\nmemory=18896\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 583672, + "unit": "ns", + "extra": "gctime=0\nmemory=51672\nallocs=1514\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4182708, + "unit": "ns", + "extra": "gctime=0\nmemory=150864\nallocs=4623\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 636841.5, + "unit": "ns", + "extra": "gctime=0\nmemory=52680\nallocs=1087\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 58000, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 39791, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 46708, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 82834, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 40507, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1125958, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 76281, + "unit": "ns", + "extra": "gctime=0\nmemory=6832\nallocs=182\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1893542, + "unit": "ns", + "extra": "gctime=0\nmemory=6308192\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1983291, + "unit": "ns", + "extra": "gctime=0\nmemory=6308192\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1983291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=6308192\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1902916, + "unit": "ns", + "extra": "gctime=0\nmemory=6308192\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 236090, + "unit": "ns", + "extra": "gctime=0\nmemory=22144\nallocs=649\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 11366312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=240632\nallocs=7788\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1031795, + "unit": "ns", + "extra": "gctime=0\nmemory=83456\nallocs=1981\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 417500, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 418750, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 421625, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 417042, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 235292.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 542791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49976\nallocs=1562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 285283, + "unit": "ns", + "extra": "gctime=0\nmemory=21712\nallocs=609\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 772062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 706208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 697646.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 762583, + "unit": "ns", + "extra": "gctime=0\nmemory=2508432\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1149490, + "unit": "ns", + "extra": "gctime=0\nmemory=91648\nallocs=3437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6544625, + "unit": "ns", + "extra": "gctime=0\nmemory=289016\nallocs=8896\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 922659, + "unit": "ns", + "extra": "gctime=0\nmemory=76128\nallocs=1558\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 3408770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 3443353.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 3457250, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 3456792, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 171296, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1419875, + "unit": "ns", + "extra": "gctime=0\nmemory=58128\nallocs=1840\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 409534, + "unit": "ns", + "extra": "gctime=0\nmemory=19824\nallocs=491\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 6173000, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 6227062, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 6229500, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 6220208, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 1077389, + "unit": "ns", + "extra": "gctime=0\nmemory=91648\nallocs=3437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 7523396, + "unit": "ns", + "extra": "gctime=0\nmemory=314112\nallocs=9747\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1562795, + "unit": "ns", + "extra": "gctime=0\nmemory=79616\nallocs=1629\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 471979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 254250, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 341666, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 903750, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 47482, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/Metal", + "value": 444459, + "unit": "ns", + "extra": "gctime=0\nmemory=8016\nallocs=281\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 250912, + "unit": "ns", + "extra": "gctime=0\nmemory=2016\nallocs=78\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 2323333, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 1760000, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 2038125, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 3288375, + "unit": "ns", + "extra": "gctime=0\nmemory=2362080\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 256085, + "unit": "ns", + "extra": "gctime=0\nmemory=20504\nallocs=772\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/Metal", + "value": 2239750, + "unit": "ns", + "extra": "gctime=0\nmemory=75576\nallocs=2533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 788328, + "unit": "ns", + "extra": "gctime=0\nmemory=21320\nallocs=614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 57770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 39583, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 46209, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 82958, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 28615, + "unit": "ns", + "extra": "gctime=0\nmemory=2800\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1133833, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 76141, + "unit": "ns", + "extra": "gctime=0\nmemory=7248\nallocs=208\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2030708, + "unit": "ns", + "extra": "gctime=0\nmemory=7356832\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2061250, + "unit": "ns", + "extra": "gctime=0\nmemory=7356832\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2095541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7356832\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1953708, + "unit": "ns", + "extra": "gctime=0\nmemory=7356832\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 235210, + "unit": "ns", + "extra": "gctime=0\nmemory=19384\nallocs=711\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 11813583, + "unit": "ns", + "extra": "gctime=0\nmemory=247640\nallocs=8001\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1051380, + "unit": "ns", + "extra": "gctime=0\nmemory=83912\nallocs=1970\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 58041, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 39500, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 46583, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 83000, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 49005, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1119604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 74191, + "unit": "ns", + "extra": "gctime=0\nmemory=6688\nallocs=183\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1921625, + "unit": "ns", + "extra": "gctime=0\nmemory=6307008\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1977958, + "unit": "ns", + "extra": "gctime=0\nmemory=6307008\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1975834, + "unit": "ns", + "extra": "gctime=0\nmemory=6307008\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1880167, + "unit": "ns", + "extra": "gctime=0\nmemory=6307008\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 241693.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19648\nallocs=695\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 9953791, + "unit": "ns", + "extra": "gctime=0\nmemory=202400\nallocs=6495\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 942554.5, + "unit": "ns", + "extra": "gctime=0\nmemory=78624\nallocs=1628\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 34479, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 430437, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 50680, + "unit": "ns", + "extra": "gctime=0\nmemory=6576\nallocs=179\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 6792, + "unit": "ns", + "extra": "gctime=0\nmemory=16784\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16784\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16784\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7750, + "unit": "ns", + "extra": "gctime=0\nmemory=16784\nallocs=92\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 203416, + "unit": "ns", + "extra": "gctime=0\nmemory=17968\nallocs=655\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4793624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=150536\nallocs=5007\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 372109, + "unit": "ns", + "extra": "gctime=0\nmemory=46208\nallocs=1134\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/2 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/4 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/8 thread(s)", + "value": 291, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/CPU/1 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/CUDA", + "value": 31576, + "unit": "ns", + "extra": "gctime=0\nmemory=1008\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/Metal", + "value": 256375, + "unit": "ns", + "extra": "gctime=0\nmemory=2496\nallocs=97\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/forward/GPU/AMDGPU", + "value": 41481, + "unit": "ns", + "extra": "gctime=0\nmemory=1008\nallocs=54\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2625, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2959, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3708, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 3083, + "unit": "ns", + "extra": "gctime=0\nmemory=4288\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/CUDA", + "value": 195018, + "unit": "ns", + "extra": "gctime=0\nmemory=11104\nallocs=403\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/Metal", + "value": 948875.5, + "unit": "ns", + "extra": "gctime=0\nmemory=36200\nallocs=1247\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=identity)(2 x 128)/zygote/GPU/AMDGPU", + "value": 155612, + "unit": "ns", + "extra": "gctime=0\nmemory=8640\nallocs=226\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 421875, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 441709, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 429167, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 435458, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 141479, + "unit": "ns", + "extra": "gctime=0\nmemory=13392\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 2056458, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=1710\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 371433.5, + "unit": "ns", + "extra": "gctime=0\nmemory=30608\nallocs=745\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3799959, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 3818354, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 3807708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3804750, + "unit": "ns", + "extra": "gctime=0\nmemory=8408304\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 767195, + "unit": "ns", + "extra": "gctime=0\nmemory=69016\nallocs=1898\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 10786853.5, + "unit": "ns", + "extra": "gctime=0\nmemory=230376\nallocs=6922\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1321443, + "unit": "ns", + "extra": "gctime=0\nmemory=96008\nallocs=2083\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/2 thread(s)", + "value": 49849770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/4 thread(s)", + "value": 26028875, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/8 thread(s)", + "value": 35533000, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/CPU/1 thread(s)", + "value": 98215750, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/GPU/CUDA", + "value": 1616583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/forward/GPU/AMDGPU", + "value": 1008870, + "unit": "ns", + "extra": "gctime=0\nmemory=9328\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 154650938, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 88783312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 112429209, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 294910479, + "unit": "ns", + "extra": "gctime=0\nmemory=134218640\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/GPU/CUDA", + "value": 6458815, + "unit": "ns", + "extra": "gctime=0\nmemory=11840\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=32)/zygote/GPU/AMDGPU", + "value": 5555575, + "unit": "ns", + "extra": "gctime=0\nmemory=27072\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/2 thread(s)", + "value": 19000, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/4 thread(s)", + "value": 15250, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/8 thread(s)", + "value": 17333, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/CPU/1 thread(s)", + "value": 15833.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/CUDA", + "value": 19992, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/Metal", + "value": 220583, + "unit": "ns", + "extra": "gctime=0\nmemory=7176\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/forward/GPU/AMDGPU", + "value": 26010, + "unit": "ns", + "extra": "gctime=0\nmemory=1712\nallocs=59\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 11083, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 7874.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 9250, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 17209, + "unit": "ns", + "extra": "gctime=0\nmemory=50496\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/CUDA", + "value": 289103, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/Metal", + "value": 1568584, + "unit": "ns", + "extra": "gctime=0\nmemory=56552\nallocs=1881\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=tanh)(32 x 128)/zygote/GPU/AMDGPU", + "value": 152592, + "unit": "ns", + "extra": "gctime=0\nmemory=15736\nallocs=347\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 9000, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8625, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10541, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 139112, + "unit": "ns", + "extra": "gctime=0\nmemory=12368\nallocs=300\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 697750, + "unit": "ns", + "extra": "gctime=0\nmemory=35440\nallocs=1140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 238993, + "unit": "ns", + "extra": "gctime=0\nmemory=18992\nallocs=530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 9833, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10083, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 9166.5, + "unit": "ns", + "extra": "gctime=0\nmemory=22128\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 687570, + "unit": "ns", + "extra": "gctime=0\nmemory=59624\nallocs=1673\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5031875, + "unit": "ns", + "extra": "gctime=0\nmemory=177168\nallocs=5408\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 662346.5, + "unit": "ns", + "extra": "gctime=0\nmemory=62456\nallocs=1242\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 9917, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9541, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 10542, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 8749.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 134423, + "unit": "ns", + "extra": "gctime=0\nmemory=11664\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 923917, + "unit": "ns", + "extra": "gctime=0\nmemory=33016\nallocs=1047\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 70081, + "unit": "ns", + "extra": "gctime=0\nmemory=15568\nallocs=341\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 13458, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14542, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 18166, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 13353.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44512\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 641312, + "unit": "ns", + "extra": "gctime=0\nmemory=52312\nallocs=1515\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4618500, + "unit": "ns", + "extra": "gctime=0\nmemory=151056\nallocs=4562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 358499, + "unit": "ns", + "extra": "gctime=0\nmemory=54296\nallocs=1083\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 458, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 34342, + "unit": "ns", + "extra": "gctime=0\nmemory=4848\nallocs=166\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 431187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 207942, + "unit": "ns", + "extra": "gctime=0\nmemory=9520\nallocs=363\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17936\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 9167, + "unit": "ns", + "extra": "gctime=0\nmemory=17936\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10042, + "unit": "ns", + "extra": "gctime=0\nmemory=17936\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7958, + "unit": "ns", + "extra": "gctime=0\nmemory=17936\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 228270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=21368\nallocs=821\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4859333, + "unit": "ns", + "extra": "gctime=0\nmemory=159360\nallocs=5274\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 663502, + "unit": "ns", + "extra": "gctime=0\nmemory=48104\nallocs=1199\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 16417, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 14500, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 14625, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 10688, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 21203, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/Metal", + "value": 211000, + "unit": "ns", + "extra": "gctime=0\nmemory=7176\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 190002, + "unit": "ns", + "extra": "gctime=0\nmemory=8272\nallocs=327\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 32021, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 32333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 32000, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 31958, + "unit": "ns", + "extra": "gctime=0\nmemory=67136\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 306523.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19208\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/Metal", + "value": 1710500, + "unit": "ns", + "extra": "gctime=0\nmemory=62696\nallocs=2079\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(32, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 607906, + "unit": "ns", + "extra": "gctime=0\nmemory=19768\nallocs=550\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 440750, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 447459, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 462000, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 440958, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 195262, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1977271, + "unit": "ns", + "extra": "gctime=0\nmemory=50976\nallocs=1614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 352533, + "unit": "ns", + "extra": "gctime=0\nmemory=30112\nallocs=739\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3835270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 3836854, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 3833000, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3792291, + "unit": "ns", + "extra": "gctime=0\nmemory=8404848\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 575749.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56296\nallocs=1601\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 9765667, + "unit": "ns", + "extra": "gctime=0\nmemory=187400\nallocs=5572\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1392133, + "unit": "ns", + "extra": "gctime=0\nmemory=80344\nallocs=1826\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/2 thread(s)", + "value": 832773604.5, + "unit": "ns", + "extra": "gctime=49778812.5\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/4 thread(s)", + "value": 416178167, + "unit": "ns", + "extra": "gctime=415666\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/8 thread(s)", + "value": 542118041, + "unit": "ns", + "extra": "gctime=437292\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/CPU/1 thread(s)", + "value": 1579553104.5, + "unit": "ns", + "extra": "gctime=49996770.5\nmemory=536870976\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/GPU/CUDA", + "value": 22761017, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/forward/GPU/AMDGPU", + "value": 14155420, + "unit": "ns", + "extra": "gctime=0\nmemory=20992\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 3970317583, + "unit": "ns", + "extra": "gctime=128399168\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 1513852416, + "unit": "ns", + "extra": "gctime=36266459\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 1789856541, + "unit": "ns", + "extra": "gctime=30905416\nmemory=2147484752\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 5283712875, + "unit": "ns", + "extra": "gctime=44651958\nmemory=2147484560\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/GPU/CUDA", + "value": 365579607, + "unit": "ns", + "extra": "gctime=295145420\nmemory=12048\nallocs=437\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=512)/zygote/GPU/AMDGPU", + "value": 88621175, + "unit": "ns", + "extra": "gctime=0\nmemory=65536\nallocs=806\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 76438, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 77000, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 79229, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 76708, + "unit": "ns", + "extra": "gctime=0\nmemory=394480\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 229997.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 543458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49976\nallocs=1562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 110081, + "unit": "ns", + "extra": "gctime=0\nmemory=17840\nallocs=367\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 260208, + "unit": "ns", + "extra": "gctime=0\nmemory=2246000\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 203667, + "unit": "ns", + "extra": "gctime=0\nmemory=2246000\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 196833, + "unit": "ns", + "extra": "gctime=0\nmemory=2246000\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 191729, + "unit": "ns", + "extra": "gctime=0\nmemory=2246000\nallocs=143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1114396, + "unit": "ns", + "extra": "gctime=0\nmemory=87408\nallocs=3262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6038000, + "unit": "ns", + "extra": "gctime=0\nmemory=274440\nallocs=8464\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 642801.5, + "unit": "ns", + "extra": "gctime=0\nmemory=73792\nallocs=1514\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/2 thread(s)", + "value": 200090500, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/4 thread(s)", + "value": 103769271, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/8 thread(s)", + "value": 138991458, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/CPU/1 thread(s)", + "value": 388799084, + "unit": "ns", + "extra": "gctime=0\nmemory=134217792\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/GPU/CUDA", + "value": 5837129, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/forward/GPU/AMDGPU", + "value": 3417993, + "unit": "ns", + "extra": "gctime=0\nmemory=11584\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 619060979.5, + "unit": "ns", + "extra": "gctime=2591062.5\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 352556416.5, + "unit": "ns", + "extra": "gctime=2853291.5\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 439711625, + "unit": "ns", + "extra": "gctime=2695687.5\nmemory=536872016\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 1186861083, + "unit": "ns", + "extra": "gctime=3098000\nmemory=536871824\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/GPU/CUDA", + "value": 26566336, + "unit": "ns", + "extra": "gctime=0\nmemory=11840\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(512, Bsize=128)/zygote/GPU/AMDGPU", + "value": 21956286, + "unit": "ns", + "extra": "gctime=0\nmemory=37312\nallocs=806\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7458, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5500, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 6125, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10417, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 27251, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 562916, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 50400, + "unit": "ns", + "extra": "gctime=0\nmemory=6544\nallocs=164\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 213500, + "unit": "ns", + "extra": "gctime=0\nmemory=801936\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 222020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=801936\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 223812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=801936\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 207000, + "unit": "ns", + "extra": "gctime=0\nmemory=801936\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 236231, + "unit": "ns", + "extra": "gctime=0\nmemory=22144\nallocs=649\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 9332834, + "unit": "ns", + "extra": "gctime=0\nmemory=240568\nallocs=7784\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 535116, + "unit": "ns", + "extra": "gctime=0\nmemory=77808\nallocs=1780\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 7937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8542, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10209, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 8063, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 131595.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11536\nallocs=274\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 853687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=32720\nallocs=1044\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 76050, + "unit": "ns", + "extra": "gctime=0\nmemory=15552\nallocs=340\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7458, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8792, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 11791, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7292, + "unit": "ns", + "extra": "gctime=0\nmemory=18224\nallocs=90\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 567254, + "unit": "ns", + "extra": "gctime=0\nmemory=49832\nallocs=1430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4301958, + "unit": "ns", + "extra": "gctime=0\nmemory=148704\nallocs=4570\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 325113, + "unit": "ns", + "extra": "gctime=0\nmemory=51160\nallocs=1002\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 417, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 459, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 25819, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 434312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15928\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 49671, + "unit": "ns", + "extra": "gctime=0\nmemory=6816\nallocs=184\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 10312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40912\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 10083, + "unit": "ns", + "extra": "gctime=0\nmemory=40912\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10750, + "unit": "ns", + "extra": "gctime=0\nmemory=40912\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 9750, + "unit": "ns", + "extra": "gctime=0\nmemory=40912\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 265547, + "unit": "ns", + "extra": "gctime=0\nmemory=22304\nallocs=678\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5398313, + "unit": "ns", + "extra": "gctime=0\nmemory=174672\nallocs=5728\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 396234, + "unit": "ns", + "extra": "gctime=0\nmemory=55536\nallocs=1354\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 109687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 84812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 99979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 146708, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/CUDA", + "value": 24233, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/Metal", + "value": 264270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7184\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/forward/GPU/AMDGPU", + "value": 190652, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=190\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 514729, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 478375, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 478291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 477791, + "unit": "ns", + "extra": "gctime=0\nmemory=1051456\nallocs=18\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/CUDA", + "value": 248436.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19208\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/Metal", + "value": 2180959, + "unit": "ns", + "extra": "gctime=0\nmemory=70856\nallocs=2355\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=gelu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 625596.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20008\nallocs=558\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/2 thread(s)", + "value": 5312, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/4 thread(s)", + "value": 6666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/8 thread(s)", + "value": 7125, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/CPU/1 thread(s)", + "value": 4250, + "unit": "ns", + "extra": "gctime=0\nmemory=32832\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/GPU/CUDA", + "value": 15903, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/forward/GPU/AMDGPU", + "value": 73410, + "unit": "ns", + "extra": "gctime=0\nmemory=2832\nallocs=84\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 11604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 10333, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 10833, + "unit": "ns", + "extra": "gctime=0\nmemory=132176\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 16375, + "unit": "ns", + "extra": "gctime=0\nmemory=131984\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/GPU/CUDA", + "value": 226653, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=32)/zygote/GPU/AMDGPU", + "value": 376134, + "unit": "ns", + "extra": "gctime=0\nmemory=17328\nallocs=402\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/2 thread(s)", + "value": 39791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/4 thread(s)", + "value": 51167, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/8 thread(s)", + "value": 52125, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/CPU/1 thread(s)", + "value": 13416, + "unit": "ns", + "extra": "gctime=0\nmemory=131136\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/GPU/CUDA", + "value": 21786, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/forward/GPU/AMDGPU", + "value": 85321, + "unit": "ns", + "extra": "gctime=0\nmemory=5104\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 36271, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 27895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 31083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=525392\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 57167, + "unit": "ns", + "extra": "gctime=0\nmemory=525200\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/GPU/CUDA", + "value": 202344, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=128)/zygote/GPU/AMDGPU", + "value": 421284, + "unit": "ns", + "extra": "gctime=0\nmemory=24016\nallocs=397\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/2 thread(s)", + "value": 1958, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1917, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/8 thread(s)", + "value": 2125, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/CPU/1 thread(s)", + "value": 1792, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/CUDA", + "value": 20196, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/Metal", + "value": 299167, + "unit": "ns", + "extra": "gctime=0\nmemory=6896\nallocs=232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/forward/GPU/AMDGPU", + "value": 28950, + "unit": "ns", + "extra": "gctime=0\nmemory=1776\nallocs=63\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 2250, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 2292, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 2417, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 2042, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/CUDA", + "value": 217371, + "unit": "ns", + "extra": "gctime=0\nmemory=14616\nallocs=562\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/Metal", + "value": 1508312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=54968\nallocs=1860\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=tanh)(2 x 128)/zygote/GPU/AMDGPU", + "value": 138781, + "unit": "ns", + "extra": "gctime=0\nmemory=13512\nallocs=323\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 5542, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 4771, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6458, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 3979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 157473.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 631417, + "unit": "ns", + "extra": "gctime=0\nmemory=37632\nallocs=1263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 61190, + "unit": "ns", + "extra": "gctime=0\nmemory=10912\nallocs=271\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8167, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8459, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8521, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8458, + "unit": "ns", + "extra": "gctime=0\nmemory=15936\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 937391.5, + "unit": "ns", + "extra": "gctime=0\nmemory=72896\nallocs=2910\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5777041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=231432\nallocs=7696\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 389174, + "unit": "ns", + "extra": "gctime=0\nmemory=54352\nallocs=1173\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 57250, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 57000, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 57833, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 58417, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 37214, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 547062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 208472, + "unit": "ns", + "extra": "gctime=0\nmemory=12816\nallocs=428\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 459083, + "unit": "ns", + "extra": "gctime=0\nmemory=1063248\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 465000, + "unit": "ns", + "extra": "gctime=0\nmemory=1063248\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 465625, + "unit": "ns", + "extra": "gctime=0\nmemory=1063248\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 433750, + "unit": "ns", + "extra": "gctime=0\nmemory=1063248\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 272599, + "unit": "ns", + "extra": "gctime=0\nmemory=23080\nallocs=861\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 8129896, + "unit": "ns", + "extra": "gctime=0\nmemory=211704\nallocs=6760\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 806418, + "unit": "ns", + "extra": "gctime=0\nmemory=77896\nallocs=1676\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/2 thread(s)", + "value": 3328417, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/4 thread(s)", + "value": 1771625, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/8 thread(s)", + "value": 2335146.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/CPU/1 thread(s)", + "value": 6300271, + "unit": "ns", + "extra": "gctime=0\nmemory=8388672\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/GPU/CUDA", + "value": 204810, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/forward/GPU/AMDGPU", + "value": 203992, + "unit": "ns", + "extra": "gctime=0\nmemory=6704\nallocs=185\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 11439854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 6573354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 8339250, + "unit": "ns", + "extra": "gctime=0\nmemory=33555536\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 21058146, + "unit": "ns", + "extra": "gctime=0\nmemory=33555344\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/GPU/CUDA", + "value": 743384, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=128)/zygote/GPU/AMDGPU", + "value": 1068820.5, + "unit": "ns", + "extra": "gctime=0\nmemory=28944\nallocs=572\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 5167, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5042, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6167, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 4500, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 151540.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 722604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=36672\nallocs=1235\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 56651, + "unit": "ns", + "extra": "gctime=0\nmemory=10816\nallocs=265\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12128\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=12128\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7334, + "unit": "ns", + "extra": "gctime=0\nmemory=12128\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7125, + "unit": "ns", + "extra": "gctime=0\nmemory=12128\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 791278.5, + "unit": "ns", + "extra": "gctime=0\nmemory=67568\nallocs=2685\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 5188146, + "unit": "ns", + "extra": "gctime=0\nmemory=216304\nallocs=7156\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 384994, + "unit": "ns", + "extra": "gctime=0\nmemory=49584\nallocs=1088\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 97604, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 95667, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 128792, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 101375, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 158164, + "unit": "ns", + "extra": "gctime=0\nmemory=13392\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 2055833, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=1710\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 182992, + "unit": "ns", + "extra": "gctime=0\nmemory=24480\nallocs=504\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2022875, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2032584, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2028770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2017771, + "unit": "ns", + "extra": "gctime=0\nmemory=7359616\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 765556, + "unit": "ns", + "extra": "gctime=0\nmemory=67176\nallocs=1814\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 11016875, + "unit": "ns", + "extra": "gctime=0\nmemory=227400\nallocs=6857\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1122166, + "unit": "ns", + "extra": "gctime=0\nmemory=92760\nallocs=1890\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/2 thread(s)", + "value": 34604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/4 thread(s)", + "value": 34646, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/8 thread(s)", + "value": 34625, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/CPU/1 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=128\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/GPU/CUDA", + "value": 15150, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/forward/GPU/AMDGPU", + "value": 81540, + "unit": "ns", + "extra": "gctime=0\nmemory=2368\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 2562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 3042, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 3000, + "unit": "ns", + "extra": "gctime=0\nmemory=1360\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 2084, + "unit": "ns", + "extra": "gctime=0\nmemory=1168\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/GPU/CUDA", + "value": 143347.5, + "unit": "ns", + "extra": "gctime=0\nmemory=9456\nallocs=360\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=4)/zygote/GPU/AMDGPU", + "value": 346678.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12032\nallocs=335\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7166, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5416, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 6083, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10000, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 36135, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 551291, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 49200, + "unit": "ns", + "extra": "gctime=0\nmemory=5936\nallocs=136\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 223083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=800976\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 221187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=800976\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 221792, + "unit": "ns", + "extra": "gctime=0\nmemory=800976\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 242208, + "unit": "ns", + "extra": "gctime=0\nmemory=800976\nallocs=111\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 248462, + "unit": "ns", + "extra": "gctime=0\nmemory=19648\nallocs=695\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 8070728.5, + "unit": "ns", + "extra": "gctime=0\nmemory=202400\nallocs=6495\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=identity, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 522185, + "unit": "ns", + "extra": "gctime=0\nmemory=74352\nallocs=1508\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/2 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/4 thread(s)", + "value": 4000, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/8 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/CPU/1 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/CUDA", + "value": 21588, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/Metal", + "value": 246125, + "unit": "ns", + "extra": "gctime=0\nmemory=7352\nallocs=263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/forward/GPU/AMDGPU", + "value": 46230, + "unit": "ns", + "extra": "gctime=0\nmemory=1920\nallocs=75\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 14917, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 14916, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 14917, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 14917, + "unit": "ns", + "extra": "gctime=0\nmemory=54640\nallocs=9\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/CUDA", + "value": 332170, + "unit": "ns", + "extra": "gctime=0\nmemory=15424\nallocs=541\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/Metal", + "value": 1019208, + "unit": "ns", + "extra": "gctime=0\nmemory=54272\nallocs=1837\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=identity)(32 x 128)/zygote/GPU/AMDGPU", + "value": 196351.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16048\nallocs=383\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 120916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 127979, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 128958, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 117208, + "unit": "ns", + "extra": "gctime=0\nmemory=1052048\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 151353, + "unit": "ns", + "extra": "gctime=0\nmemory=13392\nallocs=309\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1994500, + "unit": "ns", + "extra": "gctime=0\nmemory=53696\nallocs=1710\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 169312, + "unit": "ns", + "extra": "gctime=0\nmemory=24784\nallocs=523\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1914625, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1908250, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1923666, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1920583, + "unit": "ns", + "extra": "gctime=0\nmemory=6310928\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 750665, + "unit": "ns", + "extra": "gctime=0\nmemory=65584\nallocs=1732\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 10544021, + "unit": "ns", + "extra": "gctime=0\nmemory=219904\nallocs=6637\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1234013, + "unit": "ns", + "extra": "gctime=0\nmemory=96032\nallocs=1998\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 18833, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 18292, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 20291, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 18125, + "unit": "ns", + "extra": "gctime=0\nmemory=134288\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 121910, + "unit": "ns", + "extra": "gctime=0\nmemory=12544\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1357604, + "unit": "ns", + "extra": "gctime=0\nmemory=34288\nallocs=1053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 78121, + "unit": "ns", + "extra": "gctime=0\nmemory=16144\nallocs=302\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 229792, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 226249.5, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 217104.5, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 216500, + "unit": "ns", + "extra": "gctime=0\nmemory=933632\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 554792, + "unit": "ns", + "extra": "gctime=0\nmemory=54424\nallocs=1515\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6233583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=167336\nallocs=4921\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 487889, + "unit": "ns", + "extra": "gctime=0\nmemory=57784\nallocs=1133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/2 thread(s)", + "value": 26458.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/4 thread(s)", + "value": 29375, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/8 thread(s)", + "value": 28895.5, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/CPU/1 thread(s)", + "value": 1209, + "unit": "ns", + "extra": "gctime=0\nmemory=4352\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/GPU/CUDA", + "value": 16098, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/forward/GPU/AMDGPU", + "value": 83481, + "unit": "ns", + "extra": "gctime=0\nmemory=2368\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 4541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 5083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 5312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18256\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 4625, + "unit": "ns", + "extra": "gctime=0\nmemory=18064\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/GPU/CUDA", + "value": 220627, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(16, Bsize=4)/zygote/GPU/AMDGPU", + "value": 386633.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15360\nallocs=414\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 308417, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 309792, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 307375, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 304375, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 251986, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1131042, + "unit": "ns", + "extra": "gctime=0\nmemory=48184\nallocs=1530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 279722, + "unit": "ns", + "extra": "gctime=0\nmemory=24752\nallocs=661\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 542542, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 530625, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 532167, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 552625, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1165833, + "unit": "ns", + "extra": "gctime=0\nmemory=85904\nallocs=3212\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6317687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=268496\nallocs=8303\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 872538, + "unit": "ns", + "extra": "gctime=0\nmemory=70688\nallocs=1474\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 22083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 23292, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 22583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 19833, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 127624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13376\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1486562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=37008\nallocs=1149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 76640, + "unit": "ns", + "extra": "gctime=0\nmemory=16864\nallocs=322\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 226271, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 212542, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 214229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 212500, + "unit": "ns", + "extra": "gctime=0\nmemory=805680\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 857148, + "unit": "ns", + "extra": "gctime=0\nmemory=65552\nallocs=1730\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 7438083, + "unit": "ns", + "extra": "gctime=0\nmemory=202848\nallocs=6053\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=identity, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 550605, + "unit": "ns", + "extra": "gctime=0\nmemory=73344\nallocs=1429\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 7417, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 6583, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 8000, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 5958, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 153413, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 792104, + "unit": "ns", + "extra": "gctime=0\nmemory=36872\nallocs=1232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 69611, + "unit": "ns", + "extra": "gctime=0\nmemory=10864\nallocs=268\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 10209, + "unit": "ns", + "extra": "gctime=0\nmemory=42336\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 10729, + "unit": "ns", + "extra": "gctime=0\nmemory=42336\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10709, + "unit": "ns", + "extra": "gctime=0\nmemory=42336\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10917, + "unit": "ns", + "extra": "gctime=0\nmemory=42336\nallocs=85\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 866679, + "unit": "ns", + "extra": "gctime=0\nmemory=69792\nallocs=2754\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5386000, + "unit": "ns", + "extra": "gctime=0\nmemory=218432\nallocs=7134\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 393064, + "unit": "ns", + "extra": "gctime=0\nmemory=52160\nallocs=1135\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 6708, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 5312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 6646, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5042, + "unit": "ns", + "extra": "gctime=0\nmemory=1632\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 156782.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 767937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=36672\nallocs=1235\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 59941, + "unit": "ns", + "extra": "gctime=0\nmemory=10928\nallocs=272\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=13360\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7334, + "unit": "ns", + "extra": "gctime=0\nmemory=13360\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13360\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7542, + "unit": "ns", + "extra": "gctime=0\nmemory=13360\nallocs=87\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 815076, + "unit": "ns", + "extra": "gctime=0\nmemory=70624\nallocs=2816\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 5485417, + "unit": "ns", + "extra": "gctime=0\nmemory=228896\nallocs=7571\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 400604, + "unit": "ns", + "extra": "gctime=0\nmemory=51264\nallocs=1122\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/2 thread(s)", + "value": 14539750, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/4 thread(s)", + "value": 7703625, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/8 thread(s)", + "value": 10129458, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/CPU/1 thread(s)", + "value": 27745125, + "unit": "ns", + "extra": "gctime=0\nmemory=33554496\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/GPU/CUDA", + "value": 531314, + "unit": "ns", + "extra": "gctime=0\nmemory=528\nallocs=24\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/forward/GPU/AMDGPU", + "value": 395439, + "unit": "ns", + "extra": "gctime=0\nmemory=20992\nallocs=348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 46281666, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 26486646, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 33555291, + "unit": "ns", + "extra": "gctime=0\nmemory=134218832\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 85896125, + "unit": "ns", + "extra": "gctime=0\nmemory=134218640\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/GPU/CUDA", + "value": 2671865, + "unit": "ns", + "extra": "gctime=0\nmemory=11840\nallocs=432\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=512)/zygote/GPU/AMDGPU", + "value": 3267562, + "unit": "ns", + "extra": "gctime=0\nmemory=62064\nallocs=736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 69667, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 66916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 67667, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 67000, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 134478.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13376\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1454896, + "unit": "ns", + "extra": "gctime=0\nmemory=37008\nallocs=1149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 228332, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=387\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 477062, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 441208, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 441375, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 440792, + "unit": "ns", + "extra": "gctime=0\nmemory=1068048\nallocs=123\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 789089, + "unit": "ns", + "extra": "gctime=0\nmemory=68984\nallocs=1896\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 7688417, + "unit": "ns", + "extra": "gctime=0\nmemory=212712\nallocs=6323\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 805468, + "unit": "ns", + "extra": "gctime=0\nmemory=76920\nallocs=1597\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 584, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 542, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 32040, + "unit": "ns", + "extra": "gctime=0\nmemory=4112\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 444854, + "unit": "ns", + "extra": "gctime=0\nmemory=13656\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 49641, + "unit": "ns", + "extra": "gctime=0\nmemory=6496\nallocs=174\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 9834, + "unit": "ns", + "extra": "gctime=0\nmemory=44800\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 10687, + "unit": "ns", + "extra": "gctime=0\nmemory=44800\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 11083.5, + "unit": "ns", + "extra": "gctime=0\nmemory=44800\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 9042, + "unit": "ns", + "extra": "gctime=0\nmemory=44800\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 291258, + "unit": "ns", + "extra": "gctime=0\nmemory=21768\nallocs=806\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4894917, + "unit": "ns", + "extra": "gctime=0\nmemory=159704\nallocs=5216\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 392374, + "unit": "ns", + "extra": "gctime=0\nmemory=49992\nallocs=1211\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 9916, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 9875, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 9791, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 9792, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 22712, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/Metal", + "value": 227500, + "unit": "ns", + "extra": "gctime=0\nmemory=7384\nallocs=263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 217302, + "unit": "ns", + "extra": "gctime=0\nmemory=8240\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 45958, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 46083, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 46417, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 46042, + "unit": "ns", + "extra": "gctime=0\nmemory=88640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 299669, + "unit": "ns", + "extra": "gctime=0\nmemory=18088\nallocs=683\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/Metal", + "value": 956875, + "unit": "ns", + "extra": "gctime=0\nmemory=50896\nallocs=1672\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 627331.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15992\nallocs=472\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 56459, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 56459, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 57042, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 57791, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 29020, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 571042, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 206262, + "unit": "ns", + "extra": "gctime=0\nmemory=13008\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 499291, + "unit": "ns", + "extra": "gctime=0\nmemory=1064208\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 466208, + "unit": "ns", + "extra": "gctime=0\nmemory=1064208\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 466374.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1064208\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 483250, + "unit": "ns", + "extra": "gctime=0\nmemory=1064208\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 250980, + "unit": "ns", + "extra": "gctime=0\nmemory=21224\nallocs=795\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 9451750, + "unit": "ns", + "extra": "gctime=0\nmemory=249920\nallocs=8052\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 850639, + "unit": "ns", + "extra": "gctime=0\nmemory=80120\nallocs=1871\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 659313, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 657750, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 664375, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 613687.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 221566, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1372416, + "unit": "ns", + "extra": "gctime=0\nmemory=58128\nallocs=1840\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 238043, + "unit": "ns", + "extra": "gctime=0\nmemory=18976\nallocs=438\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2210125, + "unit": "ns", + "extra": "gctime=0\nmemory=17900528\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2247917, + "unit": "ns", + "extra": "gctime=0\nmemory=17900528\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2238417, + "unit": "ns", + "extra": "gctime=0\nmemory=17900528\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2232375, + "unit": "ns", + "extra": "gctime=0\nmemory=17900528\nallocs=145\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 1036904, + "unit": "ns", + "extra": "gctime=0\nmemory=87408\nallocs=3262\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 8492166, + "unit": "ns", + "extra": "gctime=0\nmemory=298448\nallocs=9286\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=identity, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1379048.5, + "unit": "ns", + "extra": "gctime=0\nmemory=78144\nallocs=1626\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 20604.5, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 20500, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 21375, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 21291, + "unit": "ns", + "extra": "gctime=0\nmemory=134544\nallocs=43\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 125525.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13376\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1450958, + "unit": "ns", + "extra": "gctime=0\nmemory=37008\nallocs=1149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 76440.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16864\nallocs=322\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 233020.5, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 255209, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 220583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 224854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=936864\nallocs=120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 823179, + "unit": "ns", + "extra": "gctime=0\nmemory=67144\nallocs=1812\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 7635209, + "unit": "ns", + "extra": "gctime=0\nmemory=210344\nallocs=6273\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=true)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 561210.5, + "unit": "ns", + "extra": "gctime=0\nmemory=74232\nallocs=1434\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 22882, + "unit": "ns", + "extra": "gctime=0\nmemory=3072\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 439812, + "unit": "ns", + "extra": "gctime=0\nmemory=15928\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 50421, + "unit": "ns", + "extra": "gctime=0\nmemory=6880\nallocs=188\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 10604, + "unit": "ns", + "extra": "gctime=0\nmemory=45264\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 11416, + "unit": "ns", + "extra": "gctime=0\nmemory=45264\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 10791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=45264\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 10375, + "unit": "ns", + "extra": "gctime=0\nmemory=45264\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 270980, + "unit": "ns", + "extra": "gctime=0\nmemory=19960\nallocs=740\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5356520.5, + "unit": "ns", + "extra": "gctime=0\nmemory=181672\nallocs=5941\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 411824, + "unit": "ns", + "extra": "gctime=0\nmemory=56360\nallocs=1361\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 10792, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 10250, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 10167, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 8750, + "unit": "ns", + "extra": "gctime=0\nmemory=3360\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 134053, + "unit": "ns", + "extra": "gctime=0\nmemory=11536\nallocs=274\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 834416, + "unit": "ns", + "extra": "gctime=0\nmemory=32720\nallocs=1044\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 67971, + "unit": "ns", + "extra": "gctime=0\nmemory=15040\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7542, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7792, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8042, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=17552\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 543489, + "unit": "ns", + "extra": "gctime=0\nmemory=48240\nallocs=1348\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4166125, + "unit": "ns", + "extra": "gctime=0\nmemory=141496\nallocs=4352\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 329703, + "unit": "ns", + "extra": "gctime=0\nmemory=50112\nallocs=982\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 1792, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 1625, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 1834, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 1542, + "unit": "ns", + "extra": "gctime=0\nmemory=1280\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 20840, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/Metal", + "value": 300875, + "unit": "ns", + "extra": "gctime=0\nmemory=6896\nallocs=232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 192452, + "unit": "ns", + "extra": "gctime=0\nmemory=4912\nallocs=259\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 3375, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 3500, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 3583, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 3166, + "unit": "ns", + "extra": "gctime=0\nmemory=4800\nallocs=14\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 231108, + "unit": "ns", + "extra": "gctime=0\nmemory=16984\nallocs=667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/Metal", + "value": 1624666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=60832\nallocs=2056\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(2, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 597176, + "unit": "ns", + "extra": "gctime=0\nmemory=16648\nallocs=470\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/2 thread(s)", + "value": 151791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/4 thread(s)", + "value": 106625, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/8 thread(s)", + "value": 129417, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/CPU/1 thread(s)", + "value": 225312, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/CUDA", + "value": 23800, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/Metal", + "value": 298375, + "unit": "ns", + "extra": "gctime=0\nmemory=7184\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/forward/GPU/AMDGPU", + "value": 36681, + "unit": "ns", + "extra": "gctime=0\nmemory=1232\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 176833, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 87833, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 110583, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 251000, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/CUDA", + "value": 234463.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/Metal", + "value": 1991770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=64072\nallocs=2140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=tanh)(512 x 128)/zygote/GPU/AMDGPU", + "value": 221652, + "unit": "ns", + "extra": "gctime=0\nmemory=15976\nallocs=362\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 7291, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 5416, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 5959, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 10167, + "unit": "ns", + "extra": "gctime=0\nmemory=132096\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 32586, + "unit": "ns", + "extra": "gctime=0\nmemory=3712\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 559708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 50471, + "unit": "ns", + "extra": "gctime=0\nmemory=6016\nallocs=141\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 223250, + "unit": "ns", + "extra": "gctime=0\nmemory=932112\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 265500, + "unit": "ns", + "extra": "gctime=0\nmemory=932112\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 229334, + "unit": "ns", + "extra": "gctime=0\nmemory=932112\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 213562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=932112\nallocs=113\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 264008.5, + "unit": "ns", + "extra": "gctime=0\nmemory=21240\nallocs=777\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 8366500, + "unit": "ns", + "extra": "gctime=0\nmemory=209488\nallocs=6713\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 536235, + "unit": "ns", + "extra": "gctime=0\nmemory=75080\nallocs=1509\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 15417, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 14917, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 16667, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 15750, + "unit": "ns", + "extra": "gctime=0\nmemory=5408\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 153470.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11984\nallocs=467\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 758125, + "unit": "ns", + "extra": "gctime=0\nmemory=36872\nallocs=1232\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 239772, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=532\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 23958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=52896\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 24417, + "unit": "ns", + "extra": "gctime=0\nmemory=52896\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 23291, + "unit": "ns", + "extra": "gctime=0\nmemory=52896\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 22833, + "unit": "ns", + "extra": "gctime=0\nmemory=52896\nallocs=102\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 908275, + "unit": "ns", + "extra": "gctime=0\nmemory=76048\nallocs=2955\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 5574500, + "unit": "ns", + "extra": "gctime=0\nmemory=233472\nallocs=7569\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 696222.5, + "unit": "ns", + "extra": "gctime=0\nmemory=56624\nallocs=1212\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 9625, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9792, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 10875, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9625.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7344\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 138347.5, + "unit": "ns", + "extra": "gctime=0\nmemory=12496\nallocs=308\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 828291, + "unit": "ns", + "extra": "gctime=0\nmemory=35736\nallocs=1143\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 71601, + "unit": "ns", + "extra": "gctime=0\nmemory=16064\nallocs=347\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 14333, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 14625, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 14354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 13875, + "unit": "ns", + "extra": "gctime=0\nmemory=43568\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 750912, + "unit": "ns", + "extra": "gctime=0\nmemory=58672\nallocs=1592\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5349500, + "unit": "ns", + "extra": "gctime=0\nmemory=170152\nallocs=5129\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 372023.5, + "unit": "ns", + "extra": "gctime=0\nmemory=62704\nallocs=1198\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/2 thread(s)", + "value": 10458, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/4 thread(s)", + "value": 9917, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/8 thread(s)", + "value": 11291, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/CPU/1 thread(s)", + "value": 9958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7088\nallocs=32\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/CUDA", + "value": 136667, + "unit": "ns", + "extra": "gctime=0\nmemory=11664\nallocs=282\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/Metal", + "value": 897645.5, + "unit": "ns", + "extra": "gctime=0\nmemory=33016\nallocs=1047\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/forward/GPU/AMDGPU", + "value": 70471, + "unit": "ns", + "extra": "gctime=0\nmemory=15360\nallocs=328\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 12708, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 12958, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 12834, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 12249.5, + "unit": "ns", + "extra": "gctime=0\nmemory=40112\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/CUDA", + "value": 610096, + "unit": "ns", + "extra": "gctime=0\nmemory=50720\nallocs=1433\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/Metal", + "value": 4478395.5, + "unit": "ns", + "extra": "gctime=0\nmemory=143648\nallocs=4347\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=false)(32 x 32)/zygote/GPU/AMDGPU", + "value": 343163, + "unit": "ns", + "extra": "gctime=0\nmemory=53136\nallocs=1056\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/2 thread(s)", + "value": 31500, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/4 thread(s)", + "value": 32334, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/8 thread(s)", + "value": 31063, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/CPU/1 thread(s)", + "value": 1667, + "unit": "ns", + "extra": "gctime=0\nmemory=2304\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/GPU/CUDA", + "value": 16053, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/forward/GPU/AMDGPU", + "value": 74040, + "unit": "ns", + "extra": "gctime=0\nmemory=5120\nallocs=86\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/2 thread(s)", + "value": 5104, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/4 thread(s)", + "value": 5000, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/8 thread(s)", + "value": 5291, + "unit": "ns", + "extra": "gctime=0\nmemory=10064\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/CPU/1 thread(s)", + "value": 6521, + "unit": "ns", + "extra": "gctime=0\nmemory=9872\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/GPU/CUDA", + "value": 143200, + "unit": "ns", + "extra": "gctime=0\nmemory=9456\nallocs=360\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=128)/zygote/GPU/AMDGPU", + "value": 371674, + "unit": "ns", + "extra": "gctime=0\nmemory=24192\nallocs=408\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 250, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 25340, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 455083, + "unit": "ns", + "extra": "gctime=0\nmemory=15616\nallocs=529\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 48400, + "unit": "ns", + "extra": "gctime=0\nmemory=6736\nallocs=179\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7000, + "unit": "ns", + "extra": "gctime=0\nmemory=17024\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7500, + "unit": "ns", + "extra": "gctime=0\nmemory=17024\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7792, + "unit": "ns", + "extra": "gctime=0\nmemory=17024\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17024\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 193241, + "unit": "ns", + "extra": "gctime=0\nmemory=20080\nallocs=609\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5200291, + "unit": "ns", + "extra": "gctime=0\nmemory=172320\nallocs=5736\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 392484, + "unit": "ns", + "extra": "gctime=0\nmemory=52480\nallocs=1278\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/2 thread(s)", + "value": 2042, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/4 thread(s)", + "value": 2042, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/8 thread(s)", + "value": 2125, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/CPU/1 thread(s)", + "value": 1959, + "unit": "ns", + "extra": "gctime=0\nmemory=5440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/CUDA", + "value": 26289, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/Metal", + "value": 430104.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15928\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/forward/GPU/AMDGPU", + "value": 209682, + "unit": "ns", + "extra": "gctime=0\nmemory=12960\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/2 thread(s)", + "value": 17604, + "unit": "ns", + "extra": "gctime=0\nmemory=49616\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/4 thread(s)", + "value": 17791.5, + "unit": "ns", + "extra": "gctime=0\nmemory=49616\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/8 thread(s)", + "value": 17646, + "unit": "ns", + "extra": "gctime=0\nmemory=49616\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/CPU/1 thread(s)", + "value": 17396, + "unit": "ns", + "extra": "gctime=0\nmemory=49616\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/CUDA", + "value": 277617, + "unit": "ns", + "extra": "gctime=0\nmemory=21784\nallocs=824\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/Metal", + "value": 5801541, + "unit": "ns", + "extra": "gctime=0\nmemory=183896\nallocs=5989\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(32 x 32)/zygote/GPU/AMDGPU", + "value": 717947, + "unit": "ns", + "extra": "gctime=0\nmemory=57880\nallocs=1441\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 177042, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 174792, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 154291, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 148125, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 216713, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1455354, + "unit": "ns", + "extra": "gctime=0\nmemory=56336\nallocs=1808\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 198202, + "unit": "ns", + "extra": "gctime=0\nmemory=18928\nallocs=435\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 1277542, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 1327937, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 1325666.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1327375, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 968973, + "unit": "ns", + "extra": "gctime=0\nmemory=84848\nallocs=3168\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 6943375, + "unit": "ns", + "extra": "gctime=0\nmemory=293688\nallocs=9154\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1113911, + "unit": "ns", + "extra": "gctime=0\nmemory=74880\nallocs=1607\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/2 thread(s)", + "value": 24542, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/4 thread(s)", + "value": 26041, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/8 thread(s)", + "value": 25666, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/CPU/1 thread(s)", + "value": 24771, + "unit": "ns", + "extra": "gctime=0\nmemory=132624\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/CUDA", + "value": 260001, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/Metal", + "value": 1053562.5, + "unit": "ns", + "extra": "gctime=0\nmemory=48184\nallocs=1530\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/forward/GPU/AMDGPU", + "value": 119771, + "unit": "ns", + "extra": "gctime=0\nmemory=18608\nallocs=415\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/2 thread(s)", + "value": 128854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/4 thread(s)", + "value": 173624.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/8 thread(s)", + "value": 118334, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/CPU/1 thread(s)", + "value": 117125, + "unit": "ns", + "extra": "gctime=0\nmemory=1320960\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/CUDA", + "value": 1179575, + "unit": "ns", + "extra": "gctime=0\nmemory=84848\nallocs=3168\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/Metal", + "value": 6306396, + "unit": "ns", + "extra": "gctime=0\nmemory=268592\nallocs=8303\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=false)(16 x 16 x 4 x 32)/zygote/GPU/AMDGPU", + "value": 607237, + "unit": "ns", + "extra": "gctime=0\nmemory=70224\nallocs=1446\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 334, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 334, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 22595, + "unit": "ns", + "extra": "gctime=0\nmemory=3072\nallocs=104\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 442041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15616\nallocs=529\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 49271, + "unit": "ns", + "extra": "gctime=0\nmemory=6784\nallocs=182\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7458, + "unit": "ns", + "extra": "gctime=0\nmemory=17600\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7667, + "unit": "ns", + "extra": "gctime=0\nmemory=17600\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 7834, + "unit": "ns", + "extra": "gctime=0\nmemory=17600\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17600\nallocs=95\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 198802.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17736\nallocs=671\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5362999.5, + "unit": "ns", + "extra": "gctime=0\nmemory=179120\nallocs=5952\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 396804, + "unit": "ns", + "extra": "gctime=0\nmemory=53640\nallocs=1306\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 7125, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 6500, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 7292, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 5041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2400\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 164192.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13104\nallocs=537\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 455646, + "unit": "ns", + "extra": "gctime=0\nmemory=37632\nallocs=1263\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 237732, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=532\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 9875, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 10417, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 10208, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 10042, + "unit": "ns", + "extra": "gctime=0\nmemory=17152\nallocs=88\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 959197, + "unit": "ns", + "extra": "gctime=0\nmemory=77024\nallocs=3085\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 6029979, + "unit": "ns", + "extra": "gctime=0\nmemory=243912\nallocs=8109\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 680937, + "unit": "ns", + "extra": "gctime=0\nmemory=56608\nallocs=1219\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/2 thread(s)", + "value": 667, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/4 thread(s)", + "value": 666, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/8 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/CPU/1 thread(s)", + "value": 625, + "unit": "ns", + "extra": "gctime=0\nmemory=1088\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/CUDA", + "value": 22283, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/Metal", + "value": 328083, + "unit": "ns", + "extra": "gctime=0\nmemory=7104\nallocs=261\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/forward/GPU/AMDGPU", + "value": 215392, + "unit": "ns", + "extra": "gctime=0\nmemory=4960\nallocs=265\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/2 thread(s)", + "value": 4583, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/4 thread(s)", + "value": 4583, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/8 thread(s)", + "value": 4792, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/CPU/1 thread(s)", + "value": 4583, + "unit": "ns", + "extra": "gctime=0\nmemory=6608\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/CUDA", + "value": 228018.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15864\nallocs=614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/Metal", + "value": 1580083, + "unit": "ns", + "extra": "gctime=0\nmemory=48672\nallocs=1657\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(2, bias=false, act=gelu)(2 x 128)/zygote/GPU/AMDGPU", + "value": 600937, + "unit": "ns", + "extra": "gctime=0\nmemory=11288\nallocs=293\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 8729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 8416, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 9958, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 8041.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3616\nallocs=36\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 136239, + "unit": "ns", + "extra": "gctime=0\nmemory=12368\nallocs=300\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 802291.5, + "unit": "ns", + "extra": "gctime=0\nmemory=35440\nallocs=1140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 70015.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15968\nallocs=341\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8750, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8625, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8959, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=20784\nallocs=100\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 658040, + "unit": "ns", + "extra": "gctime=0\nmemory=56192\nallocs=1507\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 4729875, + "unit": "ns", + "extra": "gctime=0\nmemory=167800\nallocs=5137\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(2, act=identity, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 352293.5, + "unit": "ns", + "extra": "gctime=0\nmemory=59616\nallocs=1120\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/2 thread(s)", + "value": 126416, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/4 thread(s)", + "value": 95833, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/8 thread(s)", + "value": 128834, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/CPU/1 thread(s)", + "value": 182979, + "unit": "ns", + "extra": "gctime=0\nmemory=262208\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/GPU/CUDA", + "value": 45872, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/forward/GPU/AMDGPU", + "value": 94981, + "unit": "ns", + "extra": "gctime=0\nmemory=2112\nallocs=84\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/2 thread(s)", + "value": 324875, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/4 thread(s)", + "value": 187937.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/8 thread(s)", + "value": 313667, + "unit": "ns", + "extra": "gctime=0\nmemory=1049680\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/CPU/1 thread(s)", + "value": 567334, + "unit": "ns", + "extra": "gctime=0\nmemory=1049488\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/GPU/CUDA", + "value": 202221.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=4)/zygote/GPU/AMDGPU", + "value": 486324.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15088\nallocs=397\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/2 thread(s)", + "value": 397167, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/4 thread(s)", + "value": 215333, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/8 thread(s)", + "value": 288334, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/CPU/1 thread(s)", + "value": 756250, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/CUDA", + "value": 43363, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/Metal", + "value": 434250, + "unit": "ns", + "extra": "gctime=0\nmemory=7392\nallocs=265\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/forward/GPU/AMDGPU", + "value": 83831, + "unit": "ns", + "extra": "gctime=0\nmemory=2720\nallocs=125\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1452083, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 860875, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1133979.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2441959, + "unit": "ns", + "extra": "gctime=0\nmemory=1837680\nallocs=13\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/CUDA", + "value": 257428, + "unit": "ns", + "extra": "gctime=0\nmemory=15744\nallocs=561\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/Metal", + "value": 1874458, + "unit": "ns", + "extra": "gctime=0\nmemory=62528\nallocs=2119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=identity)(512 x 128)/zygote/GPU/AMDGPU", + "value": 354814, + "unit": "ns", + "extra": "gctime=0\nmemory=16816\nallocs=431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 653292, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 672583.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 659166, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 630917, + "unit": "ns", + "extra": "gctime=0\nmemory=3146992\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 215216.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19024\nallocs=682\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1367729, + "unit": "ns", + "extra": "gctime=0\nmemory=58128\nallocs=1840\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 250902.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18432\nallocs=404\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2450458, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2465792, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2418771, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 2409583, + "unit": "ns", + "extra": "gctime=0\nmemory=19997968\nallocs=151\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 1051964, + "unit": "ns", + "extra": "gctime=0\nmemory=90592\nallocs=3393\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 12445958, + "unit": "ns", + "extra": "gctime=0\nmemory=313568\nallocs=9730\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=relu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1482025, + "unit": "ns", + "extra": "gctime=0\nmemory=80224\nallocs=1685\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/2 thread(s)", + "value": 33917, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/4 thread(s)", + "value": 34417, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/8 thread(s)", + "value": 34209, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/CPU/1 thread(s)", + "value": 750, + "unit": "ns", + "extra": "gctime=0\nmemory=576\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/GPU/CUDA", + "value": 15426, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/forward/GPU/AMDGPU", + "value": 73461, + "unit": "ns", + "extra": "gctime=0\nmemory=2768\nallocs=80\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 3125, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 3333, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 3354.5, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 3125, + "unit": "ns", + "extra": "gctime=0\nmemory=2960\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/GPU/CUDA", + "value": 142341, + "unit": "ns", + "extra": "gctime=0\nmemory=9456\nallocs=360\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=32)/zygote/GPU/AMDGPU", + "value": 346283, + "unit": "ns", + "extra": "gctime=0\nmemory=14224\nallocs=337\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 407041, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 402250, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 408917, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 421250, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 43456, + "unit": "ns", + "extra": "gctime=0\nmemory=3536\nallocs=119\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1157209, + "unit": "ns", + "extra": "gctime=0\nmemory=16032\nallocs=533\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 240343, + "unit": "ns", + "extra": "gctime=0\nmemory=13088\nallocs=431\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3870917, + "unit": "ns", + "extra": "gctime=0\nmemory=8405472\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 3997208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8405472\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 3992125, + "unit": "ns", + "extra": "gctime=0\nmemory=8405472\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3774916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8405472\nallocs=117\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 245622, + "unit": "ns", + "extra": "gctime=0\nmemory=21224\nallocs=795\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 11639916.5, + "unit": "ns", + "extra": "gctime=0\nmemory=249920\nallocs=8052\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=true)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1244322, + "unit": "ns", + "extra": "gctime=0\nmemory=85128\nallocs=2037\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/2 thread(s)", + "value": 3917, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/4 thread(s)", + "value": 3959, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/8 thread(s)", + "value": 3916, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/CPU/1 thread(s)", + "value": 3875, + "unit": "ns", + "extra": "gctime=0\nmemory=16640\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/CUDA", + "value": 33107, + "unit": "ns", + "extra": "gctime=0\nmemory=1008\nallocs=38\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/Metal", + "value": 264209, + "unit": "ns", + "extra": "gctime=0\nmemory=2696\nallocs=94\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/forward/GPU/AMDGPU", + "value": 41200, + "unit": "ns", + "extra": "gctime=0\nmemory=1072\nallocs=58\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 15708, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 15792, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 15958, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 15667, + "unit": "ns", + "extra": "gctime=0\nmemory=55216\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/CUDA", + "value": 266369, + "unit": "ns", + "extra": "gctime=0\nmemory=13328\nallocs=472\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/Metal", + "value": 881458, + "unit": "ns", + "extra": "gctime=0\nmemory=37864\nallocs=1258\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=false, act=identity)(32 x 128)/zygote/GPU/AMDGPU", + "value": 166502, + "unit": "ns", + "extra": "gctime=0\nmemory=11776\nallocs=307\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 404166, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 221375, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 295292, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 760167, + "unit": "ns", + "extra": "gctime=0\nmemory=262192\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 113190, + "unit": "ns", + "extra": "gctime=0\nmemory=960\nallocs=41\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/Metal", + "value": 496062.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7424\nallocs=265\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 90120, + "unit": "ns", + "extra": "gctime=0\nmemory=3024\nallocs=144\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1467208, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 886541, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1159125, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2466583, + "unit": "ns", + "extra": "gctime=0\nmemory=2098400\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 240367, + "unit": "ns", + "extra": "gctime=0\nmemory=15992\nallocs=601\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/Metal", + "value": 1925500, + "unit": "ns", + "extra": "gctime=0\nmemory=57384\nallocs=1920\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=false, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 356978.5, + "unit": "ns", + "extra": "gctime=0\nmemory=14984\nallocs=422\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/2 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/4 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/8 thread(s)", + "value": 583, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/CPU/1 thread(s)", + "value": 500, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/CUDA", + "value": 25613, + "unit": "ns", + "extra": "gctime=0\nmemory=3808\nallocs=128\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/Metal", + "value": 450208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=15616\nallocs=529\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/forward/GPU/AMDGPU", + "value": 209312, + "unit": "ns", + "extra": "gctime=0\nmemory=12976\nallocs=427\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 8333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=18176\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 8500, + "unit": "ns", + "extra": "gctime=0\nmemory=18176\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 9250, + "unit": "ns", + "extra": "gctime=0\nmemory=18176\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 8208, + "unit": "ns", + "extra": "gctime=0\nmemory=18176\nallocs=96\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/CUDA", + "value": 210905.5, + "unit": "ns", + "extra": "gctime=0\nmemory=19560\nallocs=755\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/Metal", + "value": 5731708.5, + "unit": "ns", + "extra": "gctime=0\nmemory=181144\nallocs=6003\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=gelu, affine=true)(4 x 32)/zygote/GPU/AMDGPU", + "value": 707187.5, + "unit": "ns", + "extra": "gctime=0\nmemory=53704\nallocs=1301\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/2 thread(s)", + "value": 834541.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/4 thread(s)", + "value": 468500, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/8 thread(s)", + "value": 621917, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/CPU/1 thread(s)", + "value": 1560125, + "unit": "ns", + "extra": "gctime=0\nmemory=2097216\nallocs=2\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/GPU/CUDA", + "value": 134548.5, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/forward/GPU/AMDGPU", + "value": 178651, + "unit": "ns", + "extra": "gctime=0\nmemory=2832\nallocs=84\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/2 thread(s)", + "value": 2687229.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/4 thread(s)", + "value": 1538104.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/8 thread(s)", + "value": 2006000, + "unit": "ns", + "extra": "gctime=0\nmemory=8389712\nallocs=31\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/CPU/1 thread(s)", + "value": 4939208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8389520\nallocs=29\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/GPU/CUDA", + "value": 259430, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(128, Bsize=32)/zygote/GPU/AMDGPU", + "value": 775478, + "unit": "ns", + "extra": "gctime=0\nmemory=20368\nallocs=459\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/2 thread(s)", + "value": 334, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/4 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/8 thread(s)", + "value": 375, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/CPU/1 thread(s)", + "value": 292, + "unit": "ns", + "extra": "gctime=0\nmemory=1440\nallocs=19\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/CUDA", + "value": 31695, + "unit": "ns", + "extra": "gctime=0\nmemory=4112\nallocs=142\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/Metal", + "value": 443479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=13344\nallocs=443\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/forward/GPU/AMDGPU", + "value": 48751, + "unit": "ns", + "extra": "gctime=0\nmemory=6496\nallocs=174\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/2 thread(s)", + "value": 7375, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/4 thread(s)", + "value": 7292, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/8 thread(s)", + "value": 8000, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/CPU/1 thread(s)", + "value": 7333.5, + "unit": "ns", + "extra": "gctime=0\nmemory=17360\nallocs=93\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/CUDA", + "value": 221109, + "unit": "ns", + "extra": "gctime=0\nmemory=19544\nallocs=737\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/Metal", + "value": 4885229, + "unit": "ns", + "extra": "gctime=0\nmemory=157352\nallocs=5224\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(2, act=relu, affine=false)(4 x 32)/zygote/GPU/AMDGPU", + "value": 371974, + "unit": "ns", + "extra": "gctime=0\nmemory=47240\nallocs=1154\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 2377333, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 2416875, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 2384125, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 2389542, + "unit": "ns", + "extra": "gctime=0\nmemory=1050128\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 214311, + "unit": "ns", + "extra": "gctime=0\nmemory=17904\nallocs=612\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1592375, + "unit": "ns", + "extra": "gctime=0\nmemory=56336\nallocs=1808\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 360673, + "unit": "ns", + "extra": "gctime=0\nmemory=20128\nallocs=510\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 4636270.5, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 4666250, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 4654917, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 4646958, + "unit": "ns", + "extra": "gctime=0\nmemory=10496000\nallocs=133\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 959765, + "unit": "ns", + "extra": "gctime=0\nmemory=85904\nallocs=3212\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 6792729.5, + "unit": "ns", + "extra": "gctime=0\nmemory=293592\nallocs=9154\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "layernorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1254372, + "unit": "ns", + "extra": "gctime=0\nmemory=74512\nallocs=1551\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 7729, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 7021, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 7250, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 9458, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 23256, + "unit": "ns", + "extra": "gctime=0\nmemory=2384\nallocs=107\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/Metal", + "value": 265042, + "unit": "ns", + "extra": "gctime=0\nmemory=7184\nallocs=234\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 38891, + "unit": "ns", + "extra": "gctime=0\nmemory=1184\nallocs=26\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 52312.5, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 52104, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 33229, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 63750, + "unit": "ns", + "extra": "gctime=0\nmemory=789264\nallocs=16\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 233595.5, + "unit": "ns", + "extra": "gctime=0\nmemory=16840\nallocs=631\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/Metal", + "value": 2003500, + "unit": "ns", + "extra": "gctime=0\nmemory=64072\nallocs=2140\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "bias_activation(512, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 225372, + "unit": "ns", + "extra": "gctime=0\nmemory=16056\nallocs=367\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/2 thread(s)", + "value": 21541, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/4 thread(s)", + "value": 24916, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/8 thread(s)", + "value": 24437.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/CPU/1 thread(s)", + "value": 5292, + "unit": "ns", + "extra": "gctime=0\nmemory=8448\nallocs=1\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/GPU/CUDA", + "value": 17600, + "unit": "ns", + "extra": "gctime=0\nmemory=512\nallocs=23\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/forward/GPU/AMDGPU", + "value": 86991, + "unit": "ns", + "extra": "gctime=0\nmemory=14768\nallocs=101\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/2 thread(s)", + "value": 11875, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/4 thread(s)", + "value": 9250, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/8 thread(s)", + "value": 10479.5, + "unit": "ns", + "extra": "gctime=0\nmemory=34640\nallocs=27\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/CPU/1 thread(s)", + "value": 17979, + "unit": "ns", + "extra": "gctime=0\nmemory=34448\nallocs=25\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/GPU/CUDA", + "value": 242050.5, + "unit": "ns", + "extra": "gctime=0\nmemory=11808\nallocs=430\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchedmm(2, Bsize=512)/zygote/GPU/AMDGPU", + "value": 379203, + "unit": "ns", + "extra": "gctime=0\nmemory=52528\nallocs=415\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/2 thread(s)", + "value": 406209, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/4 thread(s)", + "value": 223645.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/8 thread(s)", + "value": 297291, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/CPU/1 thread(s)", + "value": 762770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=262384\nallocs=6\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/CUDA", + "value": 46561, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/Metal", + "value": 507959, + "unit": "ns", + "extra": "gctime=0\nmemory=8016\nallocs=281\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/forward/GPU/AMDGPU", + "value": 89681, + "unit": "ns", + "extra": "gctime=0\nmemory=3152\nallocs=149\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/2 thread(s)", + "value": 1479041, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/4 thread(s)", + "value": 892812.5, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/8 thread(s)", + "value": 1165500, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/CPU/1 thread(s)", + "value": 2475375, + "unit": "ns", + "extra": "gctime=0\nmemory=2100080\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/CUDA", + "value": 302356, + "unit": "ns", + "extra": "gctime=0\nmemory=18136\nallocs=667\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/Metal", + "value": 2142375, + "unit": "ns", + "extra": "gctime=0\nmemory=70056\nallocs=2351\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(512, bias=true, act=relu)(512 x 128)/zygote/GPU/AMDGPU", + "value": 379453, + "unit": "ns", + "extra": "gctime=0\nmemory=18200\nallocs=465\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 435625, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 429834, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 437541, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 448000, + "unit": "ns", + "extra": "gctime=0\nmemory=1049824\nallocs=20\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 54368, + "unit": "ns", + "extra": "gctime=0\nmemory=4448\nallocs=152\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1064291, + "unit": "ns", + "extra": "gctime=0\nmemory=13760\nallocs=447\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 237093, + "unit": "ns", + "extra": "gctime=0\nmemory=12896\nallocs=429\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 3904000, + "unit": "ns", + "extra": "gctime=0\nmemory=8404288\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 4036083, + "unit": "ns", + "extra": "gctime=0\nmemory=8404288\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 4027208.5, + "unit": "ns", + "extra": "gctime=0\nmemory=8404288\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 3808500, + "unit": "ns", + "extra": "gctime=0\nmemory=8404288\nallocs=115\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 265452, + "unit": "ns", + "extra": "gctime=0\nmemory=23080\nallocs=861\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 10355209, + "unit": "ns", + "extra": "gctime=0\nmemory=211720\nallocs=6761\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "batchnorm(4, act=gelu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1238912.5, + "unit": "ns", + "extra": "gctime=0\nmemory=81272\nallocs=1740\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/2 thread(s)", + "value": 8792, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/4 thread(s)", + "value": 6917, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/8 thread(s)", + "value": 7667, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/CPU/1 thread(s)", + "value": 12375, + "unit": "ns", + "extra": "gctime=0\nmemory=16832\nallocs=5\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/CUDA", + "value": 23715, + "unit": "ns", + "extra": "gctime=0\nmemory=1056\nallocs=47\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/Metal", + "value": 228854.5, + "unit": "ns", + "extra": "gctime=0\nmemory=7976\nallocs=279\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/forward/GPU/AMDGPU", + "value": 217862, + "unit": "ns", + "extra": "gctime=0\nmemory=4720\nallocs=247\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/2 thread(s)", + "value": 45125, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/4 thread(s)", + "value": 45334, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/8 thread(s)", + "value": 45166, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/CPU/1 thread(s)", + "value": 45042, + "unit": "ns", + "extra": "gctime=0\nmemory=87936\nallocs=12\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/CUDA", + "value": 356362, + "unit": "ns", + "extra": "gctime=0\nmemory=20184\nallocs=752\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/Metal", + "value": 1233500, + "unit": "ns", + "extra": "gctime=0\nmemory=67304\nallocs=2251\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "dense(32, bias=true, act=gelu)(32 x 128)/zygote/GPU/AMDGPU", + "value": 675687, + "unit": "ns", + "extra": "gctime=0\nmemory=20520\nallocs=564\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/2 thread(s)", + "value": 85542, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/4 thread(s)", + "value": 123125, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/8 thread(s)", + "value": 86770.5, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/CPU/1 thread(s)", + "value": 83125, + "unit": "ns", + "extra": "gctime=0\nmemory=1051792\nallocs=39\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/CUDA", + "value": 190402, + "unit": "ns", + "extra": "gctime=0\nmemory=12560\nallocs=283\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/Metal", + "value": 1991958.5, + "unit": "ns", + "extra": "gctime=0\nmemory=50976\nallocs=1614\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/forward/GPU/AMDGPU", + "value": 203932, + "unit": "ns", + "extra": "gctime=0\nmemory=23424\nallocs=463\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/2 thread(s)", + "value": 2017750, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/4 thread(s)", + "value": 2018541, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/8 thread(s)", + "value": 2017917, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/CPU/1 thread(s)", + "value": 1986854, + "unit": "ns", + "extra": "gctime=0\nmemory=7356160\nallocs=108\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/CUDA", + "value": 571148, + "unit": "ns", + "extra": "gctime=0\nmemory=54456\nallocs=1517\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/Metal", + "value": 9347417, + "unit": "ns", + "extra": "gctime=0\nmemory=184408\nallocs=5506\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + }, + { + "name": "groupnorm(4, act=relu, affine=false)(16 x 16 x 32 x 32)/zygote/GPU/AMDGPU", + "value": 1095271, + "unit": "ns", + "extra": "gctime=0\nmemory=78616\nallocs=1724\nparams={\"gctrial\":true,\"time_tolerance\":0.05,\"evals_set\":false,\"samples\":10000,\"evals\":1,\"gcsample\":false,\"seconds\":5,\"overhead\":0,\"memory_tolerance\":0.01}" + } + ] } ] }