diff --git a/main/.doctrees/environment.pickle b/main/.doctrees/environment.pickle index b7579ca0cf27..b4c30a2fd4cd 100644 Binary files a/main/.doctrees/environment.pickle and b/main/.doctrees/environment.pickle differ diff --git a/main/.doctrees/getting-started/tutorials/01-vector-add.doctree b/main/.doctrees/getting-started/tutorials/01-vector-add.doctree index eacfe58be5b1..2cffb5abb63f 100644 Binary files a/main/.doctrees/getting-started/tutorials/01-vector-add.doctree and b/main/.doctrees/getting-started/tutorials/01-vector-add.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/02-fused-softmax.doctree b/main/.doctrees/getting-started/tutorials/02-fused-softmax.doctree index 12489e2926cb..4e1193010f25 100644 Binary files a/main/.doctrees/getting-started/tutorials/02-fused-softmax.doctree and b/main/.doctrees/getting-started/tutorials/02-fused-softmax.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/03-matrix-multiplication.doctree b/main/.doctrees/getting-started/tutorials/03-matrix-multiplication.doctree index ad8b4f00bc58..f16951284e06 100644 Binary files a/main/.doctrees/getting-started/tutorials/03-matrix-multiplication.doctree and b/main/.doctrees/getting-started/tutorials/03-matrix-multiplication.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/04-low-memory-dropout.doctree b/main/.doctrees/getting-started/tutorials/04-low-memory-dropout.doctree index e9804e4fa1c4..3be148319d7d 100644 Binary files a/main/.doctrees/getting-started/tutorials/04-low-memory-dropout.doctree and b/main/.doctrees/getting-started/tutorials/04-low-memory-dropout.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/05-layer-norm.doctree b/main/.doctrees/getting-started/tutorials/05-layer-norm.doctree index f14c98a7d0eb..e7d65618a5a9 100644 Binary files a/main/.doctrees/getting-started/tutorials/05-layer-norm.doctree and b/main/.doctrees/getting-started/tutorials/05-layer-norm.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/06-fused-attention.doctree b/main/.doctrees/getting-started/tutorials/06-fused-attention.doctree index f1c243a5406d..8f4d27313813 100644 Binary files a/main/.doctrees/getting-started/tutorials/06-fused-attention.doctree and b/main/.doctrees/getting-started/tutorials/06-fused-attention.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/07-extern-functions.doctree b/main/.doctrees/getting-started/tutorials/07-extern-functions.doctree index f08173ef15fc..acfd7298381e 100644 Binary files a/main/.doctrees/getting-started/tutorials/07-extern-functions.doctree and b/main/.doctrees/getting-started/tutorials/07-extern-functions.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/08-grouped-gemm.doctree b/main/.doctrees/getting-started/tutorials/08-grouped-gemm.doctree index 0dc1d8bd97a7..2563d9cb13ad 100644 Binary files a/main/.doctrees/getting-started/tutorials/08-grouped-gemm.doctree and b/main/.doctrees/getting-started/tutorials/08-grouped-gemm.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/09-persistent-matmul.doctree b/main/.doctrees/getting-started/tutorials/09-persistent-matmul.doctree index 99a873bb1fd1..236bf223e259 100644 Binary files a/main/.doctrees/getting-started/tutorials/09-persistent-matmul.doctree and b/main/.doctrees/getting-started/tutorials/09-persistent-matmul.doctree differ diff --git a/main/.doctrees/getting-started/tutorials/sg_execution_times.doctree b/main/.doctrees/getting-started/tutorials/sg_execution_times.doctree index fee1524e6659..9a4f49a74d9d 100644 Binary files a/main/.doctrees/getting-started/tutorials/sg_execution_times.doctree and b/main/.doctrees/getting-started/tutorials/sg_execution_times.doctree differ diff --git a/main/.doctrees/sg_execution_times.doctree b/main/.doctrees/sg_execution_times.doctree index a17a982a7dc8..ce9951db9ae3 100644 Binary files a/main/.doctrees/sg_execution_times.doctree and b/main/.doctrees/sg_execution_times.doctree differ diff --git a/main/_downloads/032b2a144fc26b286cf422d1aecab3b6/05-layer-norm.zip b/main/_downloads/032b2a144fc26b286cf422d1aecab3b6/05-layer-norm.zip index e681d265d10d..51ceffe3ef48 100644 Binary files a/main/_downloads/032b2a144fc26b286cf422d1aecab3b6/05-layer-norm.zip and b/main/_downloads/032b2a144fc26b286cf422d1aecab3b6/05-layer-norm.zip differ diff --git a/main/_downloads/3cf54e8aaddcfce69d180b77518fd544/07-extern-functions.zip b/main/_downloads/3cf54e8aaddcfce69d180b77518fd544/07-extern-functions.zip index 89213bfd8efc..3522a8d021ef 100644 Binary files a/main/_downloads/3cf54e8aaddcfce69d180b77518fd544/07-extern-functions.zip and b/main/_downloads/3cf54e8aaddcfce69d180b77518fd544/07-extern-functions.zip differ diff --git a/main/_downloads/4e511f795844d864249b83f016d8ce09/01-vector-add.zip b/main/_downloads/4e511f795844d864249b83f016d8ce09/01-vector-add.zip index 9b7007840fa9..48332986792b 100644 Binary files a/main/_downloads/4e511f795844d864249b83f016d8ce09/01-vector-add.zip and b/main/_downloads/4e511f795844d864249b83f016d8ce09/01-vector-add.zip differ diff --git a/main/_downloads/509ee5615f08cb2e7336a73c0511fba2/06-fused-attention.zip b/main/_downloads/509ee5615f08cb2e7336a73c0511fba2/06-fused-attention.zip index 050473c02fa2..e877cb6538d0 100644 Binary files a/main/_downloads/509ee5615f08cb2e7336a73c0511fba2/06-fused-attention.zip and b/main/_downloads/509ee5615f08cb2e7336a73c0511fba2/06-fused-attention.zip differ diff --git a/main/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip b/main/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip index d07c1d08a126..61a0f03058a4 100644 Binary files a/main/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip and b/main/_downloads/662999063954282841dc90b8945f85ce/tutorials_jupyter.zip differ diff --git a/main/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip b/main/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip index f7e2dbd05a8a..6b5adb69dadf 100644 Binary files a/main/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip and b/main/_downloads/763344228ae6bc253ed1a6cf586aa30d/tutorials_python.zip differ diff --git a/main/_downloads/9241eab99db7582ceb6cd81f77524214/04-low-memory-dropout.zip b/main/_downloads/9241eab99db7582ceb6cd81f77524214/04-low-memory-dropout.zip index 20368806bd53..b074415a8d1b 100644 Binary files a/main/_downloads/9241eab99db7582ceb6cd81f77524214/04-low-memory-dropout.zip and b/main/_downloads/9241eab99db7582ceb6cd81f77524214/04-low-memory-dropout.zip differ diff --git a/main/_downloads/a13fad8f9b954e9e27a17a2c27aee1b8/09-persistent-matmul.zip b/main/_downloads/a13fad8f9b954e9e27a17a2c27aee1b8/09-persistent-matmul.zip index 856ce552bfc4..f158b9fcb814 100644 Binary files a/main/_downloads/a13fad8f9b954e9e27a17a2c27aee1b8/09-persistent-matmul.zip and b/main/_downloads/a13fad8f9b954e9e27a17a2c27aee1b8/09-persistent-matmul.zip differ diff --git a/main/_downloads/d74d81ab958b2efbd334253f48cdb202/03-matrix-multiplication.zip b/main/_downloads/d74d81ab958b2efbd334253f48cdb202/03-matrix-multiplication.zip index 61da938edc7a..97677e3208e5 100644 Binary files a/main/_downloads/d74d81ab958b2efbd334253f48cdb202/03-matrix-multiplication.zip and b/main/_downloads/d74d81ab958b2efbd334253f48cdb202/03-matrix-multiplication.zip differ diff --git a/main/_downloads/f66de4fbee2c4ba20b6f7f3ae99f7de3/02-fused-softmax.zip b/main/_downloads/f66de4fbee2c4ba20b6f7f3ae99f7de3/02-fused-softmax.zip index 37a1833ca88b..53cf9e747d5f 100644 Binary files a/main/_downloads/f66de4fbee2c4ba20b6f7f3ae99f7de3/02-fused-softmax.zip and b/main/_downloads/f66de4fbee2c4ba20b6f7f3ae99f7de3/02-fused-softmax.zip differ diff --git a/main/_downloads/fecddac383ee03c4c47e2cf2ec91448a/08-grouped-gemm.zip b/main/_downloads/fecddac383ee03c4c47e2cf2ec91448a/08-grouped-gemm.zip index 767cbd0bf727..921a59dee2b2 100644 Binary files a/main/_downloads/fecddac383ee03c4c47e2cf2ec91448a/08-grouped-gemm.zip and b/main/_downloads/fecddac383ee03c4c47e2cf2ec91448a/08-grouped-gemm.zip differ diff --git a/main/_images/sphx_glr_01-vector-add_001.png b/main/_images/sphx_glr_01-vector-add_001.png index eb9713cfb9ed..338b6bb34a15 100644 Binary files a/main/_images/sphx_glr_01-vector-add_001.png and b/main/_images/sphx_glr_01-vector-add_001.png differ diff --git a/main/_images/sphx_glr_01-vector-add_thumb.png b/main/_images/sphx_glr_01-vector-add_thumb.png index 6eed222b4b24..b27c970d4026 100644 Binary files a/main/_images/sphx_glr_01-vector-add_thumb.png and b/main/_images/sphx_glr_01-vector-add_thumb.png differ diff --git a/main/_images/sphx_glr_02-fused-softmax_001.png b/main/_images/sphx_glr_02-fused-softmax_001.png index fa7312877b06..343c496d30db 100644 Binary files a/main/_images/sphx_glr_02-fused-softmax_001.png and b/main/_images/sphx_glr_02-fused-softmax_001.png differ diff --git a/main/_images/sphx_glr_02-fused-softmax_thumb.png b/main/_images/sphx_glr_02-fused-softmax_thumb.png index f62ecef78dd8..6482a246aca1 100644 Binary files a/main/_images/sphx_glr_02-fused-softmax_thumb.png and b/main/_images/sphx_glr_02-fused-softmax_thumb.png differ diff --git a/main/_images/sphx_glr_03-matrix-multiplication_001.png b/main/_images/sphx_glr_03-matrix-multiplication_001.png index 9899e8276701..230b94c20cd2 100644 Binary files a/main/_images/sphx_glr_03-matrix-multiplication_001.png and b/main/_images/sphx_glr_03-matrix-multiplication_001.png differ diff --git a/main/_images/sphx_glr_03-matrix-multiplication_002.png b/main/_images/sphx_glr_03-matrix-multiplication_002.png index 1d519308ba61..b2b03b0412f4 100644 Binary files a/main/_images/sphx_glr_03-matrix-multiplication_002.png and b/main/_images/sphx_glr_03-matrix-multiplication_002.png differ diff --git a/main/_images/sphx_glr_03-matrix-multiplication_thumb.png b/main/_images/sphx_glr_03-matrix-multiplication_thumb.png index 2c56fb5e9469..a4674265a023 100644 Binary files a/main/_images/sphx_glr_03-matrix-multiplication_thumb.png and b/main/_images/sphx_glr_03-matrix-multiplication_thumb.png differ diff --git a/main/_images/sphx_glr_05-layer-norm_001.png b/main/_images/sphx_glr_05-layer-norm_001.png index 83231a9a0a9a..492c96b2b53c 100644 Binary files a/main/_images/sphx_glr_05-layer-norm_001.png and b/main/_images/sphx_glr_05-layer-norm_001.png differ diff --git a/main/_images/sphx_glr_05-layer-norm_thumb.png b/main/_images/sphx_glr_05-layer-norm_thumb.png index fdf25bffab89..4e2b3e104001 100644 Binary files a/main/_images/sphx_glr_05-layer-norm_thumb.png and b/main/_images/sphx_glr_05-layer-norm_thumb.png differ diff --git a/main/_images/sphx_glr_06-fused-attention_001.png b/main/_images/sphx_glr_06-fused-attention_001.png index b5f6609e6695..93d46b0c8855 100644 Binary files a/main/_images/sphx_glr_06-fused-attention_001.png and b/main/_images/sphx_glr_06-fused-attention_001.png differ diff --git a/main/_images/sphx_glr_06-fused-attention_002.png b/main/_images/sphx_glr_06-fused-attention_002.png index 9ca99fe26795..e7ebd0e20aa8 100644 Binary files a/main/_images/sphx_glr_06-fused-attention_002.png and b/main/_images/sphx_glr_06-fused-attention_002.png differ diff --git a/main/_images/sphx_glr_06-fused-attention_003.png b/main/_images/sphx_glr_06-fused-attention_003.png index 6d77159ce76f..b0175aac0a69 100644 Binary files a/main/_images/sphx_glr_06-fused-attention_003.png and b/main/_images/sphx_glr_06-fused-attention_003.png differ diff --git a/main/_images/sphx_glr_06-fused-attention_thumb.png b/main/_images/sphx_glr_06-fused-attention_thumb.png index cf247637c767..eeab10250624 100644 Binary files a/main/_images/sphx_glr_06-fused-attention_thumb.png and b/main/_images/sphx_glr_06-fused-attention_thumb.png differ diff --git a/main/_images/sphx_glr_08-grouped-gemm_001.png b/main/_images/sphx_glr_08-grouped-gemm_001.png index e233583d42c2..dacc00993088 100644 Binary files a/main/_images/sphx_glr_08-grouped-gemm_001.png and b/main/_images/sphx_glr_08-grouped-gemm_001.png differ diff --git a/main/_images/sphx_glr_08-grouped-gemm_thumb.png b/main/_images/sphx_glr_08-grouped-gemm_thumb.png index a47fe528c2c0..c040684e3b3d 100644 Binary files a/main/_images/sphx_glr_08-grouped-gemm_thumb.png and b/main/_images/sphx_glr_08-grouped-gemm_thumb.png differ diff --git a/main/_sources/getting-started/tutorials/01-vector-add.rst.txt b/main/_sources/getting-started/tutorials/01-vector-add.rst.txt index 1fa3aaa5667b..514ea583b6c5 100644 --- a/main/_sources/getting-started/tutorials/01-vector-add.rst.txt +++ b/main/_sources/getting-started/tutorials/01-vector-add.rst.txt @@ -232,20 +232,20 @@ We can now run the decorated function above. Pass `print_data=True` to see the p size Triton Torch 0 4096.0 8.000000 8.000000 1 8192.0 15.999999 15.999999 - 2 16384.0 38.400001 31.999999 + 2 16384.0 31.999999 31.999999 3 32768.0 63.999998 63.999998 4 65536.0 127.999995 127.999995 5 131072.0 219.428568 219.428568 6 262144.0 384.000001 384.000001 7 524288.0 614.400016 614.400016 8 1048576.0 819.200021 819.200021 - 9 2097152.0 1023.999964 1023.999964 - 10 4194304.0 1260.307736 1228.800031 + 9 2097152.0 1068.521715 1023.999964 + 10 4194304.0 1228.800031 1260.307736 11 8388608.0 1424.695621 1424.695621 12 16777216.0 1560.380965 1560.380965 13 33554432.0 1631.601649 1624.859540 14 67108864.0 1669.706983 1662.646960 - 15 134217728.0 1685.813499 1678.616907 + 15 134217728.0 1684.008546 1680.410210 @@ -253,7 +253,7 @@ We can now run the decorated function above. Pass `print_data=True` to see the p .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 6.655 seconds) + **Total running time of the script:** (0 minutes 8.085 seconds) .. _sphx_glr_download_getting-started_tutorials_01-vector-add.py: diff --git a/main/_sources/getting-started/tutorials/02-fused-softmax.rst.txt b/main/_sources/getting-started/tutorials/02-fused-softmax.rst.txt index f3e15020367e..73b3125d74b2 100644 --- a/main/_sources/getting-started/tutorials/02-fused-softmax.rst.txt +++ b/main/_sources/getting-started/tutorials/02-fused-softmax.rst.txt @@ -330,104 +330,104 @@ We will then compare its performance against (1) :code:`torch.softmax` and (2) t softmax-performance: N Triton Torch - 0 256.0 483.504849 705.858619 - 1 384.0 609.909256 819.825104 - 2 512.0 752.501069 924.816481 - 3 640.0 797.672563 963.880055 - 4 768.0 881.726191 1018.420477 - 5 896.0 929.768442 1064.831875 - 6 1024.0 999.494799 1122.001862 - 7 1152.0 1102.558978 614.441742 - 8 1280.0 1142.090409 669.986852 - 9 1408.0 1168.353519 725.768367 - 10 1536.0 1193.606858 780.036253 - 11 1664.0 1211.625586 816.613371 - 12 1792.0 1234.005969 857.585530 - 13 1920.0 1257.909453 907.932932 - 14 2048.0 1279.372923 953.199062 - 15 2176.0 1256.866836 976.941599 - 16 2304.0 1272.111609 1007.991351 - 17 2432.0 1293.568044 1053.980286 - 18 2560.0 1302.443672 1084.044097 - 19 2688.0 1310.543069 1100.064422 - 20 2816.0 1329.550121 1132.845900 - 21 2944.0 1325.106103 1165.101730 - 22 3072.0 1354.263898 1181.937432 - 23 3200.0 1356.768765 1196.734630 - 24 3328.0 1361.569081 1223.640447 - 25 3456.0 1372.260392 1251.372527 - 26 3584.0 1377.824967 1263.990813 - 27 3712.0 1388.452077 1269.160168 - 28 3840.0 1390.195771 1300.371636 - 29 3968.0 1392.128272 1316.642004 - 30 4096.0 1401.440985 1325.993289 - 31 4224.0 1335.750716 1161.324964 - 32 4352.0 1341.910848 1174.716639 - 33 4480.0 1357.279725 1184.106737 - 34 4608.0 1363.505422 1194.268459 - 35 4736.0 1355.972089 1202.458558 - 36 4864.0 1376.545694 1221.469200 - 37 4992.0 1371.193311 1233.980250 - 38 5120.0 1372.928619 1249.661093 - 39 5248.0 1376.524535 1256.566657 - 40 5376.0 1380.404894 1286.955387 - 41 5504.0 1376.185871 1296.784498 - 42 5632.0 1387.308569 1316.781373 - 43 5760.0 1394.788007 1325.776437 - 44 5888.0 1393.254701 1340.097197 - 45 6016.0 1397.522942 1351.778948 - 46 6144.0 1407.036902 1372.097123 - 47 6272.0 1415.862255 1374.950602 - 48 6400.0 1412.979892 1391.500644 - 49 6528.0 1413.225499 1392.287672 - 50 6656.0 1419.501659 1401.350961 - 51 6784.0 1414.210591 1417.375134 - 52 6912.0 1427.230434 1425.416165 - 53 7040.0 1415.621263 1433.818007 - 54 7168.0 1427.284870 1435.603462 - 55 7296.0 1433.278373 1442.232505 - 56 7424.0 1429.853412 1448.062699 - 57 7552.0 1430.825507 1455.662158 - 58 7680.0 1436.538042 1462.944463 - 59 7808.0 1432.563937 1465.515737 - 60 7936.0 1436.234983 1468.974307 - 61 8064.0 1438.197861 1472.449191 - 62 8192.0 1438.820876 1484.991319 - 63 8320.0 1388.828363 1400.923094 - 64 8448.0 1382.080815 1404.912243 - 65 8576.0 1397.828756 1398.077460 - 66 8704.0 1389.103978 1401.588898 - 67 8832.0 1388.464599 1406.114984 - 68 8960.0 1394.242119 1411.884544 - 69 9088.0 1406.154896 1416.931387 - 70 9216.0 1401.332416 1424.186015 - 71 9344.0 1399.101498 1425.765896 - 72 9472.0 1397.400184 1437.613232 - 73 9600.0 1397.414486 1429.620929 - 74 9728.0 1404.383086 1443.278039 - 75 9856.0 1414.338572 1442.781395 - 76 9984.0 1400.183684 1449.253765 - 77 10112.0 1413.006160 1455.536657 - 78 10240.0 1420.935224 1468.628917 - 79 10368.0 1412.594682 1463.525008 - 80 10496.0 1412.404248 1466.541052 - 81 10624.0 1411.253194 1469.755374 - 82 10752.0 1403.887393 1470.905480 - 83 10880.0 1401.058090 1481.231802 - 84 11008.0 1418.032552 1476.656156 - 85 11136.0 1423.022405 1483.522921 - 86 11264.0 1426.833981 1487.372540 - 87 11392.0 1415.976223 1489.733547 - 88 11520.0 1420.858834 1493.378907 - 89 11648.0 1428.996384 1496.158752 - 90 11776.0 1429.987493 1501.891181 - 91 11904.0 1441.947990 1504.710219 - 92 12032.0 1420.270029 1508.437638 - 93 12160.0 1420.123557 1509.548090 - 94 12288.0 1434.772004 1390.825795 - 95 12416.0 1447.592587 1389.969032 - 96 12544.0 1443.655266 1391.978887 - 97 12672.0 1449.629836 1395.389754 + 0 256.0 483.648575 704.279422 + 1 384.0 611.207871 811.310482 + 2 512.0 760.029946 930.244010 + 3 640.0 789.112633 962.289683 + 4 768.0 885.260387 1014.476488 + 5 896.0 930.769527 1075.646692 + 6 1024.0 1000.412757 1115.599438 + 7 1152.0 1109.040519 610.401148 + 8 1280.0 1149.123110 671.226388 + 9 1408.0 1154.278795 720.621389 + 10 1536.0 1193.883514 778.498289 + 11 1664.0 1210.021635 814.654418 + 12 1792.0 1242.793808 859.324356 + 13 1920.0 1254.580283 908.801326 + 14 2048.0 1275.278633 959.301903 + 15 2176.0 1263.968962 973.449375 + 16 2304.0 1266.587365 1008.539223 + 17 2432.0 1298.087764 1057.024314 + 18 2560.0 1298.588250 1088.941346 + 19 2688.0 1318.063968 1100.100149 + 20 2816.0 1327.522874 1131.238323 + 21 2944.0 1323.948838 1168.479819 + 22 3072.0 1354.041176 1183.863854 + 23 3200.0 1358.514156 1194.711058 + 24 3328.0 1354.976383 1225.274576 + 25 3456.0 1371.850435 1244.211621 + 26 3584.0 1372.984323 1256.138600 + 27 3712.0 1387.329527 1274.311099 + 28 3840.0 1388.664938 1303.713374 + 29 3968.0 1393.204422 1313.355077 + 30 4096.0 1403.886883 1325.316368 + 31 4224.0 1337.103991 1158.609778 + 32 4352.0 1335.716353 1173.812661 + 33 4480.0 1356.217955 1183.574263 + 34 4608.0 1362.450812 1193.809073 + 35 4736.0 1357.293947 1201.733861 + 36 4864.0 1379.159459 1221.861291 + 37 4992.0 1373.826137 1234.229426 + 38 5120.0 1377.862279 1249.951049 + 39 5248.0 1377.333230 1257.339477 + 40 5376.0 1379.883175 1285.170192 + 41 5504.0 1375.907088 1294.483712 + 42 5632.0 1384.322144 1315.326958 + 43 5760.0 1389.700975 1323.655303 + 44 5888.0 1388.358894 1340.965603 + 45 6016.0 1396.791808 1357.890048 + 46 6144.0 1407.352454 1376.836366 + 47 6272.0 1416.510509 1373.925271 + 48 6400.0 1417.547916 1390.114938 + 49 6528.0 1412.386919 1394.558616 + 50 6656.0 1429.171955 1404.118948 + 51 6784.0 1414.168228 1414.152945 + 52 6912.0 1428.677389 1423.545837 + 53 7040.0 1419.135908 1431.027028 + 54 7168.0 1428.190246 1432.573337 + 55 7296.0 1430.990750 1442.788896 + 56 7424.0 1432.261790 1448.247037 + 57 7552.0 1429.992641 1455.168665 + 58 7680.0 1434.263134 1461.397513 + 59 7808.0 1434.109985 1464.693503 + 60 7936.0 1438.390409 1469.118869 + 61 8064.0 1441.079679 1476.328321 + 62 8192.0 1439.713319 1484.101664 + 63 8320.0 1389.698454 1401.500977 + 64 8448.0 1379.724473 1404.643597 + 65 8576.0 1397.012595 1396.208756 + 66 8704.0 1390.381255 1402.374490 + 67 8832.0 1381.410375 1402.728397 + 68 8960.0 1399.763867 1412.439691 + 69 9088.0 1409.690151 1417.513298 + 70 9216.0 1401.348792 1425.147763 + 71 9344.0 1398.078238 1423.023168 + 72 9472.0 1397.977452 1433.206099 + 73 9600.0 1393.409483 1431.106833 + 74 9728.0 1400.878586 1441.134502 + 75 9856.0 1411.871327 1441.104112 + 76 9984.0 1399.378512 1453.973620 + 77 10112.0 1413.104126 1455.984016 + 78 10240.0 1424.171473 1469.898677 + 79 10368.0 1412.048415 1465.451919 + 80 10496.0 1414.760017 1469.418293 + 81 10624.0 1411.783049 1466.482982 + 82 10752.0 1408.101565 1473.308784 + 83 10880.0 1402.392760 1483.867742 + 84 11008.0 1420.196950 1476.739754 + 85 11136.0 1422.397196 1485.601988 + 86 11264.0 1428.979758 1487.306093 + 87 11392.0 1414.354964 1489.877609 + 88 11520.0 1423.180249 1496.093741 + 89 11648.0 1428.865810 1497.329329 + 90 11776.0 1431.690275 1499.983541 + 91 11904.0 1442.327917 1506.970568 + 92 12032.0 1423.804221 1507.745673 + 93 12160.0 1421.960002 1512.223550 + 94 12288.0 1436.623032 1393.599156 + 95 12416.0 1447.158770 1390.654769 + 96 12544.0 1442.963517 1393.886753 + 97 12672.0 1445.889735 1394.097025 @@ -442,7 +442,7 @@ In the above plot, we can see that: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 23.233 seconds) + **Total running time of the script:** (0 minutes 23.220 seconds) .. _sphx_glr_download_getting-started_tutorials_02-fused-softmax.py: diff --git a/main/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt b/main/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt index d86067a2d7fa..5c3fc7b268b1 100644 --- a/main/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt +++ b/main/_sources/getting-started/tutorials/03-matrix-multiplication.rst.txt @@ -575,31 +575,31 @@ but feel free to arrange this script as you wish to benchmark any other matrix s 3 640.0 640.0 640.0 42.666665 42.666665 4 768.0 768.0 768.0 63.195428 68.056616 5 896.0 896.0 896.0 78.051553 93.661869 - 6 1024.0 1024.0 1024.0 104.857603 104.857603 + 6 1024.0 1024.0 1024.0 110.376426 104.857603 7 1152.0 1152.0 1152.0 135.726544 129.825388 8 1280.0 1280.0 1280.0 157.538463 163.840004 9 1408.0 1408.0 1408.0 155.765024 132.970149 - 10 1536.0 1536.0 1536.0 176.947204 153.867127 - 11 1664.0 1664.0 1664.0 183.651271 179.978245 + 10 1536.0 1536.0 1536.0 176.947204 157.286398 + 11 1664.0 1664.0 1664.0 179.978245 179.978245 12 1792.0 1792.0 1792.0 172.914215 208.137481 - 13 1920.0 1920.0 1920.0 200.347822 166.554219 + 13 1920.0 1920.0 1920.0 200.347822 168.585369 14 2048.0 2048.0 2048.0 226.719125 192.841562 - 15 2176.0 2176.0 2176.0 211.827867 209.621326 - 16 2304.0 2304.0 2304.0 229.691080 231.921091 - 17 2432.0 2432.0 2432.0 205.069087 202.118452 - 18 2560.0 2560.0 2560.0 224.438347 218.453323 - 19 2688.0 2688.0 2688.0 200.704002 198.602388 - 20 2816.0 2816.0 2816.0 212.752230 207.686706 - 21 2944.0 2944.0 2944.0 220.513412 222.482283 - 22 3072.0 3072.0 3072.0 210.494802 212.868821 - 23 3200.0 3200.0 3200.0 218.430042 219.178074 - 24 3328.0 3328.0 3328.0 209.887165 209.887165 - 25 3456.0 3456.0 3456.0 220.880999 218.486642 - 26 3584.0 3584.0 3584.0 218.772251 215.108588 - 27 3712.0 3712.0 3712.0 208.990259 214.833002 + 15 2176.0 2176.0 2176.0 211.827867 211.827867 + 16 2304.0 2304.0 2304.0 229.691080 227.503545 + 17 2432.0 2432.0 2432.0 206.576938 203.583068 + 18 2560.0 2560.0 2560.0 222.911566 219.919464 + 19 2688.0 2688.0 2688.0 198.602388 199.647657 + 20 2816.0 2816.0 2816.0 212.752230 212.752230 + 21 2944.0 2944.0 2944.0 221.493479 223.479969 + 22 3072.0 3072.0 3072.0 208.941345 211.280236 + 23 3200.0 3200.0 3200.0 216.949149 221.453296 + 24 3328.0 3328.0 3328.0 207.467716 211.118166 + 25 3456.0 3456.0 3456.0 219.677297 219.080343 + 26 3584.0 3584.0 3584.0 216.142772 215.624440 + 27 3712.0 3712.0 3712.0 211.646909 214.833002 28 3840.0 3840.0 3840.0 210.250955 209.851994 - 29 3968.0 3968.0 3968.0 208.945088 217.899880 - 30 4096.0 4096.0 4096.0 220.029067 220.029067 + 29 3968.0 3968.0 3968.0 211.114084 219.467517 + 30 4096.0 4096.0 4096.0 217.180793 220.029067 matmul-performance-fp8: M N K Triton 0 256.0 256.0 256.0 3.276800 @@ -610,7 +610,7 @@ but feel free to arrange this script as you wish to benchmark any other matrix s 5 896.0 896.0 896.0 58.538665 6 1024.0 1024.0 1024.0 61.680940 7 1152.0 1152.0 1152.0 80.702267 - 8 1280.0 1280.0 1280.0 102.400003 + 8 1280.0 1280.0 1280.0 99.902441 9 1408.0 1408.0 1408.0 81.369790 10 1536.0 1536.0 1536.0 98.303997 11 1664.0 1664.0 1664.0 115.370671 @@ -620,19 +620,19 @@ but feel free to arrange this script as you wish to benchmark any other matrix s 15 2176.0 2176.0 2176.0 120.500882 16 2304.0 2304.0 2304.0 134.959733 17 2432.0 2432.0 2432.0 132.521057 - 18 2560.0 2560.0 2560.0 145.635558 - 19 2688.0 2688.0 2688.0 118.171514 + 18 2560.0 2560.0 2560.0 145.959916 + 19 2688.0 2688.0 2688.0 117.077336 20 2816.0 2816.0 2816.0 128.655484 - 21 2944.0 2944.0 2944.0 138.819031 - 22 3072.0 3072.0 3072.0 144.079147 - 23 3200.0 3200.0 3200.0 139.433550 - 24 3328.0 3328.0 3328.0 130.893266 - 25 3456.0 3456.0 3456.0 138.287420 - 26 3584.0 3584.0 3584.0 149.113421 + 21 2944.0 2944.0 2944.0 139.988852 + 22 3072.0 3072.0 3072.0 143.713461 + 23 3200.0 3200.0 3200.0 139.737993 + 24 3328.0 3328.0 3328.0 132.336939 + 25 3456.0 3456.0 3456.0 139.725414 + 26 3584.0 3584.0 3584.0 148.620481 27 3712.0 3712.0 3712.0 142.303911 - 28 3840.0 3840.0 3840.0 137.723536 - 29 3968.0 3968.0 3968.0 147.194128 - 30 4096.0 4096.0 4096.0 154.807064 + 28 3840.0 3840.0 3840.0 137.895263 + 29 3968.0 3968.0 3968.0 147.550102 + 30 4096.0 4096.0 4096.0 155.165002 @@ -640,7 +640,7 @@ but feel free to arrange this script as you wish to benchmark any other matrix s .. rst-class:: sphx-glr-timing - **Total running time of the script:** (2 minutes 17.240 seconds) + **Total running time of the script:** (2 minutes 17.681 seconds) .. _sphx_glr_download_getting-started_tutorials_03-matrix-multiplication.py: diff --git a/main/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt b/main/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt index c755fbdf2f1e..a90eacd34919 100644 --- a/main/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt +++ b/main/_sources/getting-started/tutorials/04-low-memory-dropout.rst.txt @@ -244,7 +244,7 @@ References .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 0.709 seconds) + **Total running time of the script:** (0 minutes 0.695 seconds) .. _sphx_glr_download_getting-started_tutorials_04-low-memory-dropout.py: diff --git a/main/_sources/getting-started/tutorials/05-layer-norm.rst.txt b/main/_sources/getting-started/tutorials/05-layer-norm.rst.txt index 6d7bb122e579..cab2ef9f4975 100644 --- a/main/_sources/getting-started/tutorials/05-layer-norm.rst.txt +++ b/main/_sources/getting-started/tutorials/05-layer-norm.rst.txt @@ -431,36 +431,36 @@ Specifically, one can set :code:`'mode': 'backward'` to benchmark the backward p layer-norm-backward: N Triton Torch - 0 1024.0 120.470590 378.092307 - 1 1536.0 177.230771 444.144584 - 2 2048.0 246.994973 517.389457 - 3 2560.0 305.671638 574.205608 - 4 3072.0 370.492459 614.400016 - 5 3584.0 423.724131 547.872604 - 6 4096.0 491.520012 561.737163 - 7 4608.0 594.580630 573.015544 - 8 5120.0 623.756341 568.888888 - 9 5632.0 682.666686 563.200014 - 10 6144.0 862.315754 562.809189 - 11 6656.0 815.020435 566.468098 - 12 7168.0 961.072652 540.981122 - 13 7680.0 965.026182 548.571433 - 14 8192.0 998.010146 548.418419 + 0 1024.0 127.999995 378.092307 + 1 1536.0 185.246229 449.560983 + 2 2048.0 245.760006 517.389457 + 3 2560.0 308.743716 574.205608 + 4 3072.0 383.999986 614.400016 + 5 3584.0 434.424255 547.872604 + 6 4096.0 561.737163 561.737163 + 7 4608.0 550.208948 567.138460 + 8 5120.0 599.414623 566.267298 + 9 5632.0 637.584909 563.200014 + 10 6144.0 692.281669 562.809189 + 11 6656.0 749.971808 566.468098 + 12 7168.0 839.180472 540.981122 + 13 7680.0 959.999966 548.571433 + 14 8192.0 992.969726 549.184373 15 8704.0 705.729699 561.548373 16 9216.0 747.243211 567.138460 17 9728.0 773.086092 570.836186 - 18 10240.0 782.675148 563.669722 - 19 10752.0 806.400020 554.941947 - 20 11264.0 829.251512 559.701851 - 21 11776.0 821.581395 568.659959 + 18 10240.0 782.675148 562.379850 + 19 10752.0 806.400020 553.751076 + 20 11264.0 831.803075 559.701851 + 21 11776.0 821.581395 567.518063 22 12288.0 842.605744 572.644636 23 12800.0 865.352121 578.531044 - 24 13312.0 912.822890 579.833026 + 24 13312.0 907.636357 579.833026 25 13824.0 908.975307 579.015709 - 26 14336.0 934.956501 569.642383 - 27 14848.0 947.744653 570.163213 - 28 15360.0 957.506513 580.535429 - 29 15872.0 938.246290 581.569482 + 26 14336.0 937.504086 569.642383 + 27 14848.0 950.271992 570.163213 + 28 15360.0 957.506513 580.992908 + 29 15872.0 940.562966 582.458739 @@ -475,7 +475,7 @@ References .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 28.943 seconds) + **Total running time of the script:** (0 minutes 29.250 seconds) .. _sphx_glr_download_getting-started_tutorials_05-layer-norm.py: diff --git a/main/_sources/getting-started/tutorials/06-fused-attention.rst.txt b/main/_sources/getting-started/tutorials/06-fused-attention.rst.txt index a3f90e1c0c6d..b045b99fa4c4 100644 --- a/main/_sources/getting-started/tutorials/06-fused-attention.rst.txt +++ b/main/_sources/getting-started/tutorials/06-fused-attention.rst.txt @@ -63,25 +63,25 @@ Extra Credits: fused-attention-batch4-head32-d64-fwd-causal=True: N_CTX Triton [FP16] Triton [FP8] - 0 1024.0 114.520250 93.979997 - 1 2048.0 141.497268 110.890806 - 2 4096.0 156.485632 124.438908 - 3 8192.0 167.606288 131.898984 - 4 16384.0 174.883649 135.416744 + 0 1024.0 114.253325 93.979200 + 1 2048.0 141.914607 110.081687 + 2 4096.0 155.124249 125.638424 + 3 8192.0 169.098510 132.213495 + 4 16384.0 174.167377 135.770608 fused-attention-batch4-head32-d64-fwd-causal=False: N_CTX Triton [FP16] Triton [FP8] - 0 1024.0 158.443284 129.022867 - 1 2048.0 169.510773 127.448620 - 2 4096.0 170.796126 129.949506 - 3 8192.0 173.925536 135.619699 - 4 16384.0 176.713087 135.513582 + 0 1024.0 158.215806 128.714628 + 1 2048.0 168.736353 130.435120 + 2 4096.0 171.165385 135.189044 + 3 8192.0 173.641020 135.423011 + 4 16384.0 170.087608 136.423963 fused-attention-batch4-head32-d64-bwd-causal=True: N_CTX Triton [FP16] Triton [FP8] - 0 1024.0 73.009913 72.045073 - 1 2048.0 96.163217 96.156390 - 2 4096.0 114.970261 114.891537 - 3 8192.0 126.472846 125.482570 - 4 16384.0 132.178868 131.264283 + 0 1024.0 72.914234 71.978741 + 1 2048.0 97.192045 97.079356 + 2 4096.0 114.859580 115.403991 + 3 8192.0 124.840631 125.993505 + 4 16384.0 132.079690 131.815389 @@ -725,7 +725,7 @@ Extra Credits: .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 54.607 seconds) + **Total running time of the script:** (0 minutes 54.701 seconds) .. _sphx_glr_download_getting-started_tutorials_06-fused-attention.py: diff --git a/main/_sources/getting-started/tutorials/07-extern-functions.rst.txt b/main/_sources/getting-started/tutorials/07-extern-functions.rst.txt index 691410040692..09757d14c8ae 100644 --- a/main/_sources/getting-started/tutorials/07-extern-functions.rst.txt +++ b/main/_sources/getting-started/tutorials/07-extern-functions.rst.txt @@ -169,7 +169,7 @@ We can also customize the libdevice library path by passing the path to the `lib .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 0.228 seconds) + **Total running time of the script:** (0 minutes 0.232 seconds) .. _sphx_glr_download_getting-started_tutorials_07-extern-functions.py: diff --git a/main/_sources/getting-started/tutorials/08-grouped-gemm.rst.txt b/main/_sources/getting-started/tutorials/08-grouped-gemm.rst.txt index a3364265551b..e4a581f3c97f 100644 --- a/main/_sources/getting-started/tutorials/08-grouped-gemm.rst.txt +++ b/main/_sources/getting-started/tutorials/08-grouped-gemm.rst.txt @@ -39,7 +39,7 @@ of gemms. The scheduling is static and we do it on device. group-gemm-performance: N cuBLAS Triton - 0 128.0 0.021504 0.014336 + 0 128.0 0.020480 0.014336 1 256.0 0.023552 0.018432 2 512.0 0.032768 0.027648 3 1024.0 0.071680 0.088064 @@ -336,7 +336,7 @@ of gemms. The scheduling is static and we do it on device. .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 3.528 seconds) + **Total running time of the script:** (0 minutes 3.578 seconds) .. _sphx_glr_download_getting-started_tutorials_08-grouped-gemm.py: diff --git a/main/_sources/getting-started/tutorials/09-persistent-matmul.rst.txt b/main/_sources/getting-started/tutorials/09-persistent-matmul.rst.txt index 16e91f18a849..b62f0c02dfa1 100644 --- a/main/_sources/getting-started/tutorials/09-persistent-matmul.rst.txt +++ b/main/_sources/getting-started/tutorials/09-persistent-matmul.rst.txt @@ -652,7 +652,7 @@ Users can pass command-line arguments to specify matrix dimensions and iteration .. rst-class:: sphx-glr-timing - **Total running time of the script:** (0 minutes 1.872 seconds) + **Total running time of the script:** (0 minutes 1.912 seconds) .. _sphx_glr_download_getting-started_tutorials_09-persistent-matmul.py: diff --git a/main/_sources/getting-started/tutorials/sg_execution_times.rst.txt b/main/_sources/getting-started/tutorials/sg_execution_times.rst.txt index 642010f83f7e..1570aa7912e9 100644 --- a/main/_sources/getting-started/tutorials/sg_execution_times.rst.txt +++ b/main/_sources/getting-started/tutorials/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**04:17.017** total execution time for 9 files **from getting-started/tutorials**: +**04:19.354** total execution time for 9 files **from getting-started/tutorials**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``03-matrix-multiplication.py``) - - 02:17.240 + - 02:17.681 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_06-fused-attention.py` (``06-fused-attention.py``) - - 00:54.607 + - 00:54.701 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``05-layer-norm.py``) - - 00:28.943 + - 00:29.250 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``02-fused-softmax.py``) - - 00:23.233 + - 00:23.220 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``01-vector-add.py``) - - 00:06.655 + - 00:08.085 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_08-grouped-gemm.py` (``08-grouped-gemm.py``) - - 00:03.528 + - 00:03.578 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_09-persistent-matmul.py` (``09-persistent-matmul.py``) - - 00:01.872 + - 00:01.912 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``04-low-memory-dropout.py``) - - 00:00.709 + - 00:00.695 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_07-extern-functions.py` (``07-extern-functions.py``) - - 00:00.228 + - 00:00.232 - 0.0 diff --git a/main/_sources/sg_execution_times.rst.txt b/main/_sources/sg_execution_times.rst.txt index fccacf2d72f7..043b89c80e06 100644 --- a/main/_sources/sg_execution_times.rst.txt +++ b/main/_sources/sg_execution_times.rst.txt @@ -6,7 +6,7 @@ Computation times ================= -**04:17.017** total execution time for 9 files **from all galleries**: +**04:19.354** total execution time for 9 files **from all galleries**: .. container:: @@ -33,29 +33,29 @@ Computation times - Time - Mem (MB) * - :ref:`sphx_glr_getting-started_tutorials_03-matrix-multiplication.py` (``../python/tutorials/03-matrix-multiplication.py``) - - 02:17.240 + - 02:17.681 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_06-fused-attention.py` (``../python/tutorials/06-fused-attention.py``) - - 00:54.607 + - 00:54.701 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_05-layer-norm.py` (``../python/tutorials/05-layer-norm.py``) - - 00:28.943 + - 00:29.250 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_02-fused-softmax.py` (``../python/tutorials/02-fused-softmax.py``) - - 00:23.233 + - 00:23.220 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_01-vector-add.py` (``../python/tutorials/01-vector-add.py``) - - 00:06.655 + - 00:08.085 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_08-grouped-gemm.py` (``../python/tutorials/08-grouped-gemm.py``) - - 00:03.528 + - 00:03.578 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_09-persistent-matmul.py` (``../python/tutorials/09-persistent-matmul.py``) - - 00:01.872 + - 00:01.912 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_04-low-memory-dropout.py` (``../python/tutorials/04-low-memory-dropout.py``) - - 00:00.709 + - 00:00.695 - 0.0 * - :ref:`sphx_glr_getting-started_tutorials_07-extern-functions.py` (``../python/tutorials/07-extern-functions.py``) - - 00:00.228 + - 00:00.232 - 0.0 diff --git a/main/getting-started/tutorials/01-vector-add.html b/main/getting-started/tutorials/01-vector-add.html index 10503a35db2b..179a9a759919 100644 --- a/main/getting-started/tutorials/01-vector-add.html +++ b/main/getting-started/tutorials/01-vector-add.html @@ -243,23 +243,23 @@
Total running time of the script: (0 minutes 8.085 seconds)