From 993cfa55dcae112822a2e9681056e49697d71338 Mon Sep 17 00:00:00 2001 From: Konstantin Date: Tue, 24 Jan 2023 18:29:47 +0100 Subject: [PATCH] Performance results for version 1.5 --- SampleClips/columbia-large-1080ti.txt | 77 ++++++++++++------------ SampleClips/columbia-large-1650.txt | 77 ++++++++++++------------ SampleClips/columbia-large-vega7.txt | 83 +++++++++++++------------- SampleClips/columbia-large-vega8.txt | 83 +++++++++++++------------- SampleClips/columbia-medium-1080ti.txt | 77 ++++++++++++------------ SampleClips/columbia-medium-1650.txt | 77 ++++++++++++------------ SampleClips/columbia-medium-vega7.txt | 83 +++++++++++++------------- SampleClips/columbia-medium-vega8.txt | 83 +++++++++++++------------- SampleClips/jfk-large-1080ti.txt | 77 ++++++++++++------------ SampleClips/jfk-large-1650.txt | 77 ++++++++++++------------ SampleClips/jfk-large-vega7.txt | 83 +++++++++++++------------- SampleClips/jfk-large-vega8.txt | 83 +++++++++++++------------- SampleClips/jfk-medium-1080ti.txt | 77 ++++++++++++------------ SampleClips/jfk-medium-1650.txt | 77 ++++++++++++------------ SampleClips/jfk-medium-vega7.txt | 83 +++++++++++++------------- SampleClips/jfk-medium-vega8.txt | 83 +++++++++++++------------- 16 files changed, 648 insertions(+), 632 deletions(-) diff --git a/SampleClips/columbia-large-1080ti.txt b/SampleClips/columbia-large-1080ti.txt index a15ff4e..babf33c 100644 --- a/SampleClips/columbia-large-1080ti.txt +++ b/SampleClips/columbia-large-1080ti.txt @@ -1,43 +1,44 @@  CPU Tasks -LoadModel 6.69478 seconds -RunComplete 33.7046 seconds -Run 33.637 seconds -Callbacks 12.7347 milliseconds, 44 calls, 289.425 microseconds average -Spectrogram 679.962 milliseconds, 41 calls, 16.5844 milliseconds average -Sample 64.9643 milliseconds, 527 calls, 123.272 microseconds average -Encode 13.5814 seconds, 9 calls, 1.50905 seconds average -Decode 20.0426 seconds, 9 calls, 2.22696 seconds average -DecodeStep 19.9774 seconds, 527 calls, 37.9077 milliseconds average +LoadModel 950.578 milliseconds +RunComplete 27.5329 seconds +Run 27.434 seconds +Callbacks 10.6484 milliseconds, 44 calls, 242.009 microseconds average +Spectrogram 199.106 milliseconds, 41 calls, 4.85624 milliseconds average +Sample 58.7404 milliseconds, 527 calls, 111.462 microseconds average +Encode 11.3813 seconds, 9 calls, 1.26459 seconds average +Decode 16.0418 seconds, 9 calls, 1.78242 seconds average +DecodeStep 15.9829 seconds, 527 calls, 30.3281 milliseconds average GPU Tasks -LoadModel 6.50695 seconds -Run 33.4847 seconds -Encode 13.6283 seconds, 9 calls, 1.51426 seconds average -EncodeLayer 11.6754 seconds, 288 calls, 40.5397 milliseconds average -Decode 19.8563 seconds, 9 calls, 2.20626 seconds average -DecodeStep 19.8559 seconds, 527 calls, 37.6773 milliseconds average -DecodeLayer 18.5337 seconds, 16864 calls, 1.09901 milliseconds average +LoadModel 805.211 milliseconds +Run 27.338 seconds +Encode 11.3967 seconds, 9 calls, 1.2663 seconds average +EncodeLayer 9.78685 seconds, 288 calls, 33.9821 milliseconds average +Decode 15.9412 seconds, 9 calls, 1.77125 seconds average +DecodeStep 15.9412 seconds, 527 calls, 30.249 milliseconds average +DecodeLayer 15.0511 seconds, 16864 calls, 892.499 microseconds average Compute Shaders -mulMatTiled 14.6726 seconds, 6345 calls, 2.31247 milliseconds average -mulMatByRowTiled 11.8939 seconds, 199430 calls, 59.6393 microseconds average -norm 1.3396 seconds, 51704 calls, 25.909 microseconds average -softMax 858.923 milliseconds, 17391 calls, 49.3889 microseconds average -addRepeat 792.962 milliseconds, 68896 calls, 11.5096 microseconds average -fmaRepeat1 567.753 milliseconds, 51704 calls, 10.9808 microseconds average -copyConvert 541.081 milliseconds, 34880 calls, 15.5126 microseconds average -softMaxFixed 523.378 milliseconds, 17152 calls, 30.5141 microseconds average -copyTranspose 422.677 milliseconds, 34304 calls, 12.3215 microseconds average -addRepeatScale 329.963 milliseconds, 33728 calls, 9.78305 microseconds average -addInPlace 306.328 milliseconds, 34304 calls, 8.92981 microseconds average -addRepeatGelu 290.074 milliseconds, 17170 calls, 16.8942 microseconds average -scaleInPlace 237.756 milliseconds, 17152 calls, 13.8617 microseconds average -add 196.816 milliseconds, 16873 calls, 11.6645 microseconds average -convolutionMain2Fixed 187.457 milliseconds, 9 calls, 20.8285 milliseconds average -diagMaskInf 103.247 milliseconds, 16864 calls, 6.12231 microseconds average -convolutionMain 75.5589 milliseconds, 9 calls, 8.39543 milliseconds average -convolutionPrep1 21.4927 milliseconds, 18 calls, 1.19404 milliseconds average -addRows 9.2908 milliseconds, 527 calls, 17.6296 microseconds average -convolutionPrep2 5.0944 milliseconds, 18 calls, 283.022 microseconds average +mulMatTiled 12.0503 seconds, 6345 calls, 1.89919 milliseconds average +mulMatByRowTiled 9.45404 seconds, 199430 calls, 47.4053 microseconds average +norm 1.32432 seconds, 51704 calls, 25.6135 microseconds average +fmaRepeat1 583.884 milliseconds, 51704 calls, 11.2928 microseconds average +addRepeatEx 536.551 milliseconds, 51168 calls, 10.4861 microseconds average +softMaxFixed 534.105 milliseconds, 17152 calls, 31.1395 microseconds average +copyConvert 500.4 milliseconds, 34880 calls, 14.3463 microseconds average +copyTranspose 377.38 milliseconds, 34304 calls, 11.001 microseconds average +addRepeatScale 315.294 milliseconds, 33728 calls, 9.34814 microseconds average +addRepeatGelu 283.978 milliseconds, 17170 calls, 16.5392 microseconds average +softMaxLong 245.57 milliseconds, 527 calls, 465.976 microseconds average +scaleInPlace 226.545 milliseconds, 17152 calls, 13.2081 microseconds average +softMax 212.206 milliseconds, 16864 calls, 12.5834 microseconds average +addRepeat 209.397 milliseconds, 17728 calls, 11.8117 microseconds average +convolutionMain2Fixed 184.615 milliseconds, 9 calls, 20.5128 milliseconds average +diagMaskInf 107.423 milliseconds, 16864 calls, 6.36998 microseconds average +convolutionMain 74.7954 milliseconds, 9 calls, 8.3106 milliseconds average +convolutionPrep1 20.9316 milliseconds, 18 calls, 1.16287 milliseconds average +convolutionPrep2 3.8103 milliseconds, 18 calls, 211.683 microseconds average +addRows 3.7939 milliseconds, 527 calls, 7.19905 microseconds average +add 1.0895 milliseconds, 9 calls, 121.056 microseconds average Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 92.2616 MB RAM, 1.20719 GB VRAM -Total 93.1333 MB RAM, 4.08869 GB VRAM +Context 92.2612 MB RAM, 1.14026 GB VRAM +Total 93.1329 MB RAM, 4.02176 GB VRAM diff --git a/SampleClips/columbia-large-1650.txt b/SampleClips/columbia-large-1650.txt index ffc9e5c..0ff0edb 100644 --- a/SampleClips/columbia-large-1650.txt +++ b/SampleClips/columbia-large-1650.txt @@ -1,43 +1,44 @@ CPU Tasks -LoadModel 1.39046 seconds -RunComplete 98.7705 seconds -Run 98.6893 seconds -Callbacks 10.9446 milliseconds, 44 calls, 248.741 microseconds average -Spectrogram 1.10864 seconds, 41 calls, 27.04 milliseconds average -Sample 62.5537 milliseconds, 527 calls, 118.698 microseconds average -Encode 60.6321 seconds, 9 calls, 6.7369 seconds average -Decode 38.0118 seconds, 9 calls, 4.22353 seconds average -DecodeStep 37.949 seconds, 527 calls, 72.0095 milliseconds average +LoadModel 7.95251 seconds +RunComplete 109.423 seconds +Run 109.351 seconds +Callbacks 12.7226 milliseconds, 44 calls, 289.15 microseconds average +Spectrogram 270.286 milliseconds, 41 calls, 6.59235 milliseconds average +Sample 69.0965 milliseconds, 527 calls, 131.113 microseconds average +Encode 35.943 seconds, 9 calls, 3.99366 seconds average +Decode 73.3946 seconds, 9 calls, 8.15496 seconds average +DecodeStep 73.3251 seconds, 527 calls, 139.137 milliseconds average GPU Tasks -LoadModel 1.19991 seconds -Run 98.4248 seconds -Encode 61.0298 seconds, 9 calls, 6.78109 seconds average -EncodeLayer 51.7844 seconds, 288 calls, 179.807 milliseconds average -Decode 37.395 seconds, 9 calls, 4.155 seconds average -DecodeStep 37.3947 seconds, 527 calls, 70.9577 milliseconds average -DecodeLayer 34.8821 seconds, 16864 calls, 2.06843 milliseconds average +LoadModel 7.55659 seconds +Run 109.16 seconds +Encode 36.3141 seconds, 9 calls, 4.0349 seconds average +EncodeLayer 29.8405 seconds, 288 calls, 103.613 milliseconds average +Decode 72.8459 seconds, 9 calls, 8.09398 seconds average +DecodeStep 72.8458 seconds, 527 calls, 138.227 milliseconds average +DecodeLayer 69.0153 seconds, 16864 calls, 4.09247 milliseconds average Compute Shaders -mulMatTiled 65.2919 seconds, 6345 calls, 10.2903 milliseconds average -mulMatByRowTiled 22.3701 seconds, 199430 calls, 112.17 microseconds average -convolutionMain2Fixed 1.37801 seconds, 9 calls, 153.113 milliseconds average -softMaxFixed 1.32519 seconds, 17152 calls, 77.2618 microseconds average -addRepeat 1.0237 seconds, 68896 calls, 14.8586 microseconds average -copyTranspose 974.149 milliseconds, 34304 calls, 28.3975 microseconds average -norm 971.572 milliseconds, 51704 calls, 18.791 microseconds average -softMax 956.611 milliseconds, 17391 calls, 55.0061 microseconds average -copyConvert 899.362 milliseconds, 34880 calls, 25.7845 microseconds average -fmaRepeat1 675.729 milliseconds, 51704 calls, 13.0692 microseconds average -addRepeatGelu 531.623 milliseconds, 17170 calls, 30.9623 microseconds average -addInPlace 461.61 milliseconds, 34304 calls, 13.4564 microseconds average -scaleInPlace 394.457 milliseconds, 17152 calls, 22.9978 microseconds average -convolutionMain 331.124 milliseconds, 9 calls, 36.7915 milliseconds average -addRepeatScale 329.854 milliseconds, 33728 calls, 9.77983 microseconds average -add 203.376 milliseconds, 16873 calls, 12.0534 microseconds average -diagMaskInf 107.127 milliseconds, 16864 calls, 6.3524 microseconds average -convolutionPrep1 58.8876 milliseconds, 18 calls, 3.27153 milliseconds average -convolutionPrep2 9.1367 milliseconds, 18 calls, 507.594 microseconds average -addRows 3.6551 milliseconds, 527 calls, 6.93567 microseconds average +mulMatTiled 36.8159 seconds, 6345 calls, 5.80234 milliseconds average +mulMatByRowTiled 28.0431 seconds, 199430 calls, 140.616 microseconds average +copyTranspose 8.11917 seconds, 34304 calls, 236.683 microseconds average +fmaRepeat1 7.85961 seconds, 51704 calls, 152.012 microseconds average +addRepeatScale 4.11915 seconds, 33728 calls, 122.129 microseconds average +softMaxFixed 3.22072 seconds, 17152 calls, 187.775 microseconds average +copyConvert 2.8333 seconds, 34880 calls, 81.2298 microseconds average +addRepeatEx 2.78075 seconds, 51168 calls, 54.3455 microseconds average +norm 2.76591 seconds, 51704 calls, 53.495 microseconds average +addRepeatGelu 2.35162 seconds, 17170 calls, 136.961 microseconds average +softMaxLong 2.24788 seconds, 527 calls, 4.26543 milliseconds average +softMax 2.21477 seconds, 16864 calls, 131.331 microseconds average +convolutionMain2Fixed 1.38064 seconds, 9 calls, 153.405 milliseconds average +addRepeat 1.30665 seconds, 17728 calls, 73.7057 microseconds average +scaleInPlace 1.10329 seconds, 17152 calls, 64.3245 microseconds average +diagMaskInf 937.457 milliseconds, 16864 calls, 55.5892 microseconds average +convolutionMain 374.967 milliseconds, 9 calls, 41.663 milliseconds average +convolutionPrep1 119.171 milliseconds, 18 calls, 6.62059 milliseconds average +convolutionPrep2 27.8894 milliseconds, 18 calls, 1.54941 milliseconds average +addRows 5.2536 milliseconds, 527 calls, 9.96888 microseconds average +add 2.8285 milliseconds, 9 calls, 314.278 microseconds average Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 92.2616 MB RAM, 1.20719 GB VRAM -Total 93.1333 MB RAM, 4.08869 GB VRAM +Context 92.2612 MB RAM, 1.14026 GB VRAM +Total 93.1329 MB RAM, 4.02176 GB VRAM diff --git a/SampleClips/columbia-large-vega7.txt b/SampleClips/columbia-large-vega7.txt index 836654e..bfd6107 100644 --- a/SampleClips/columbia-large-vega7.txt +++ b/SampleClips/columbia-large-vega7.txt @@ -1,46 +1,47 @@ CPU Tasks -LoadModel 3.44286 seconds -RunComplete 174.677 seconds -Run 174.601 seconds -Callbacks 22.604 milliseconds, 44 calls, 513.727 microseconds average -Spectrogram 1.65973 seconds, 41 calls, 40.4812 milliseconds average -Sample 148.233 milliseconds, 527 calls, 281.276 microseconds average -Encode 110.192 seconds, 9 calls, 12.2436 seconds average -Decode 64.3834 seconds, 9 calls, 7.15371 seconds average -DecodeStep 64.2344 seconds, 527 calls, 121.887 milliseconds average +LoadModel 2.88964 seconds +RunComplete 140.747 seconds +Run 140.661 seconds +Callbacks 20.302 milliseconds, 44 calls, 461.409 microseconds average +Spectrogram 468.419 milliseconds, 41 calls, 11.4249 milliseconds average +Sample 139.558 milliseconds, 527 calls, 264.815 microseconds average +Encode 87.5396 seconds, 9 calls, 9.72662 seconds average +Decode 53.0971 seconds, 9 calls, 5.89968 seconds average +DecodeStep 52.9566 seconds, 527 calls, 100.487 milliseconds average GPU Tasks -LoadModel 2.20374 seconds -Run 173.895 seconds -Encode 111.531 seconds, 9 calls, 12.3923 seconds average -EncodeLayer 96.2295 seconds, 288 calls, 334.13 milliseconds average -Decode 62.3642 seconds, 9 calls, 6.92936 seconds average -DecodeStep 62.3636 seconds, 527 calls, 118.337 milliseconds average -DecodeLayer 58.6225 seconds, 16864 calls, 3.47619 milliseconds average +LoadModel 1.86694 seconds +Run 140.175 seconds +Encode 88.7441 seconds, 9 calls, 9.86046 seconds average +EncodeLayer 75.809 seconds, 288 calls, 263.226 milliseconds average +Decode 51.4306 seconds, 9 calls, 5.71451 seconds average +DecodeStep 51.43 seconds, 527 calls, 97.5901 milliseconds average +DecodeLayer 48.1822 seconds, 16864 calls, 2.85711 milliseconds average Compute Shaders -mulMatTiledEx 89.3411 seconds, 2880 calls, 31.0212 milliseconds average -mulMatTiled 25.4265 seconds, 3465 calls, 7.33809 milliseconds average -mulMatByRowTiled 22.2805 seconds, 166278 calls, 133.995 microseconds average -mulMatByRowTiledEx 13.8414 seconds, 33152 calls, 417.514 microseconds average -softMaxFixed 3.90482 seconds, 17152 calls, 227.66 microseconds average -addRepeatGelu 2.52778 seconds, 17170 calls, 147.221 microseconds average -norm 2.10933 seconds, 51704 calls, 40.7962 microseconds average -convolutionMain2Fixed 2.06899 seconds, 9 calls, 229.888 milliseconds average -matReshapePanels 1.99444 seconds, 1737 calls, 1.14821 milliseconds average -addRepeat 1.84752 seconds, 68896 calls, 26.816 microseconds average -fmaRepeat1 1.28479 seconds, 51704 calls, 24.849 microseconds average -copyConvert 1.23617 seconds, 34880 calls, 35.4406 microseconds average -softMax 1.11773 seconds, 17391 calls, 64.2704 microseconds average -scaleInPlace 848.371 milliseconds, 17152 calls, 49.4619 microseconds average -copyTranspose 796.781 milliseconds, 34304 calls, 23.227 microseconds average -addInPlace 733.523 milliseconds, 34304 calls, 21.383 microseconds average -addRepeatScale 727.214 milliseconds, 33728 calls, 21.5611 microseconds average -convolutionMain 535.149 milliseconds, 9 calls, 59.461 milliseconds average -add 525.766 milliseconds, 16873 calls, 31.1602 microseconds average -diagMaskInf 361.151 milliseconds, 16864 calls, 21.4155 microseconds average -convolutionPrep1 58.0177 milliseconds, 18 calls, 3.22321 milliseconds average -convolutionPrep2 30.1294 milliseconds, 18 calls, 1.67386 milliseconds average -addRows 1.8544 milliseconds, 527 calls, 3.51879 microseconds average +mulMatTiledEx 69.1011 seconds, 2880 calls, 23.9934 milliseconds average +mulMatTiled 21.009 seconds, 3465 calls, 6.06321 milliseconds average +mulMatByRowTiled 20.0965 seconds, 166278 calls, 120.861 microseconds average +mulMatByRowTiledEx 9.61326 seconds, 33152 calls, 289.975 microseconds average +softMaxFixed 3.7631 seconds, 17152 calls, 219.397 microseconds average +norm 2.23806 seconds, 51704 calls, 43.2859 microseconds average +convolutionMain2Fixed 2.12825 seconds, 9 calls, 236.472 milliseconds average +matReshapePanels 2.0333 seconds, 1737 calls, 1.17058 milliseconds average +addRepeatGelu 1.5491 seconds, 17170 calls, 90.2211 microseconds average +scaleInPlace 1.32928 seconds, 17152 calls, 77.5001 microseconds average +copyConvert 1.23135 seconds, 34880 calls, 35.3026 microseconds average +fmaRepeat1 1.10337 seconds, 51704 calls, 21.3401 microseconds average +addRepeatEx 1.00095 seconds, 51168 calls, 19.562 microseconds average +copyTranspose 846.807 milliseconds, 34304 calls, 24.6854 microseconds average +addRepeat 704.028 milliseconds, 17728 calls, 39.7128 microseconds average +softMaxLong 608.58 milliseconds, 527 calls, 1.1548 milliseconds average +convolutionMain 522.249 milliseconds, 9 calls, 58.0277 milliseconds average +addRepeatScale 500.937 milliseconds, 33728 calls, 14.8523 microseconds average +softMax 236.054 milliseconds, 16864 calls, 13.9975 microseconds average +diagMaskInf 171.964 milliseconds, 16864 calls, 10.1971 microseconds average +convolutionPrep1 60.7331 milliseconds, 18 calls, 3.37406 milliseconds average +convolutionPrep2 33.441 milliseconds, 18 calls, 1.85783 milliseconds average +add 12.0883 milliseconds, 9 calls, 1.34314 milliseconds average +addRows 1.9724 milliseconds, 527 calls, 3.74269 microseconds average Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 92.2617 MB RAM, 1.27432 GB VRAM -Total 93.1334 MB RAM, 4.15582 GB VRAM +Context 92.2612 MB RAM, 1.19934 GB VRAM +Total 93.1329 MB RAM, 4.08084 GB VRAM diff --git a/SampleClips/columbia-large-vega8.txt b/SampleClips/columbia-large-vega8.txt index d7efb46..405956a 100644 --- a/SampleClips/columbia-large-vega8.txt +++ b/SampleClips/columbia-large-vega8.txt @@ -1,46 +1,47 @@  CPU Tasks -LoadModel 7.12 seconds -RunComplete 133.87 seconds -Run 133.812 seconds -Callbacks 14.3995 milliseconds, 44 calls, 327.261 microseconds average -Spectrogram 694.252 milliseconds, 41 calls, 16.933 milliseconds average -Sample 77.2903 milliseconds, 527 calls, 146.661 microseconds average -Encode 78.2421 seconds, 9 calls, 8.69357 seconds average -Decode 55.5528 seconds, 9 calls, 6.17254 seconds average -DecodeStep 55.4753 seconds, 527 calls, 105.266 milliseconds average +LoadModel 1.49776 seconds +RunComplete 110.474 seconds +Run 110.407 seconds +Callbacks 14.0412 milliseconds, 44 calls, 319.118 microseconds average +Spectrogram 201.605 milliseconds, 41 calls, 4.91719 milliseconds average +Sample 65.5117 milliseconds, 527 calls, 124.311 microseconds average +Encode 64.8806 seconds, 9 calls, 7.20896 seconds average +Decode 45.5097 seconds, 9 calls, 5.05663 seconds average +DecodeStep 45.4439 seconds, 527 calls, 86.2313 milliseconds average GPU Tasks -LoadModel 4.8933 seconds -Run 133.411 seconds -Encode 79.3112 seconds, 9 calls, 8.81235 seconds average -EncodeLayer 68.0212 seconds, 288 calls, 236.185 milliseconds average -Decode 54.0997 seconds, 9 calls, 6.01108 seconds average -DecodeStep 54.0993 seconds, 527 calls, 102.655 milliseconds average -DecodeLayer 51.0747 seconds, 16864 calls, 3.02862 milliseconds average +LoadModel 951.086 milliseconds +Run 110.123 seconds +Encode 65.7998 seconds, 9 calls, 7.31109 seconds average +EncodeLayer 56.2581 seconds, 288 calls, 195.341 milliseconds average +Decode 44.3232 seconds, 9 calls, 4.9248 seconds average +DecodeStep 44.3227 seconds, 527 calls, 84.1039 milliseconds average +DecodeLayer 41.6477 seconds, 16864 calls, 2.46962 milliseconds average Compute Shaders -mulMatTiledEx 63.7687 seconds, 2880 calls, 22.1419 milliseconds average -mulMatByRowTiled 19.983 seconds, 166278 calls, 120.178 microseconds average -mulMatTiled 18.3409 seconds, 3465 calls, 5.29318 milliseconds average -mulMatByRowTiledEx 12.2089 seconds, 33152 calls, 368.27 microseconds average -softMaxFixed 3.18364 seconds, 17152 calls, 185.613 microseconds average -norm 1.90119 seconds, 51704 calls, 36.7707 microseconds average -convolutionMain2Fixed 1.81408 seconds, 9 calls, 201.564 milliseconds average -addRepeat 1.66567 seconds, 68896 calls, 24.1765 microseconds average -matReshapePanels 1.53839 seconds, 1737 calls, 885.656 microseconds average -fmaRepeat1 1.24758 seconds, 51704 calls, 24.1292 microseconds average -addRepeatGelu 1.2376 seconds, 17170 calls, 72.0792 microseconds average -copyConvert 1.03588 seconds, 34880 calls, 29.6985 microseconds average -scaleInPlace 836.078 milliseconds, 17152 calls, 48.7452 microseconds average -softMax 788.79 milliseconds, 17391 calls, 45.3562 microseconds average -copyTranspose 740.928 milliseconds, 34304 calls, 21.5989 microseconds average -addRepeatScale 735.736 milliseconds, 33728 calls, 21.8138 microseconds average -addInPlace 707.443 milliseconds, 34304 calls, 20.6228 microseconds average -add 508.618 milliseconds, 16873 calls, 30.1439 microseconds average -diagMaskInf 383.247 milliseconds, 16864 calls, 22.7258 microseconds average -convolutionMain 372.899 milliseconds, 9 calls, 41.4332 milliseconds average -convolutionPrep1 44.6208 milliseconds, 18 calls, 2.47893 milliseconds average -convolutionPrep2 31.7478 milliseconds, 18 calls, 1.76377 milliseconds average -addRows 1.5923 milliseconds, 527 calls, 3.02144 microseconds average +mulMatTiledEx 51.4882 seconds, 2880 calls, 17.8778 milliseconds average +mulMatByRowTiled 17.0626 seconds, 166278 calls, 102.615 microseconds average +mulMatTiled 15.2984 seconds, 3465 calls, 4.41513 milliseconds average +mulMatByRowTiledEx 9.04106 seconds, 33152 calls, 272.715 microseconds average +softMaxFixed 3.04176 seconds, 17152 calls, 177.342 microseconds average +norm 1.98157 seconds, 51704 calls, 38.3254 microseconds average +convolutionMain2Fixed 1.8145 seconds, 9 calls, 201.611 milliseconds average +matReshapePanels 1.526 seconds, 1737 calls, 878.526 microseconds average +addRepeatGelu 1.24631 seconds, 17170 calls, 72.5866 microseconds average +scaleInPlace 1.23743 seconds, 17152 calls, 72.1448 microseconds average +copyConvert 1.02044 seconds, 34880 calls, 29.2557 microseconds average +fmaRepeat1 993.664 milliseconds, 51704 calls, 19.2183 microseconds average +addRepeatEx 953.85 milliseconds, 51168 calls, 18.6415 microseconds average +copyTranspose 705.073 milliseconds, 34304 calls, 20.5537 microseconds average +addRepeat 581.089 milliseconds, 17728 calls, 32.778 microseconds average +addRepeatScale 553.89 milliseconds, 33728 calls, 16.4222 microseconds average +softMaxLong 387.949 milliseconds, 527 calls, 736.147 microseconds average +convolutionMain 363.351 milliseconds, 9 calls, 40.3723 milliseconds average +softMax 242.956 milliseconds, 16864 calls, 14.4068 microseconds average +diagMaskInf 179.046 milliseconds, 16864 calls, 10.617 microseconds average +convolutionPrep1 45.2096 milliseconds, 18 calls, 2.51164 milliseconds average +convolutionPrep2 28.6853 milliseconds, 18 calls, 1.59363 milliseconds average +add 8.1107 milliseconds, 9 calls, 901.189 microseconds average +addRows 1.542 milliseconds, 527 calls, 2.926 microseconds average Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 92.2617 MB RAM, 1.27432 GB VRAM -Total 93.1334 MB RAM, 4.15582 GB VRAM +Context 92.2612 MB RAM, 1.19934 GB VRAM +Total 93.1329 MB RAM, 4.08084 GB VRAM diff --git a/SampleClips/columbia-medium-1080ti.txt b/SampleClips/columbia-medium-1080ti.txt index 4a6402a..1a4076f 100644 --- a/SampleClips/columbia-medium-1080ti.txt +++ b/SampleClips/columbia-medium-1080ti.txt @@ -1,43 +1,44 @@  CPU Tasks -LoadModel 766.119 milliseconds -RunComplete 19.7043 seconds -Run 19.5957 seconds -Callbacks 9.6164 milliseconds, 37 calls, 259.903 microseconds average -Spectrogram 720.672 milliseconds, 42 calls, 17.1589 milliseconds average -Sample 64.2796 milliseconds, 511 calls, 125.792 microseconds average -Encode 7.79098 seconds, 10 calls, 779.098 milliseconds average -Decode 11.7948 seconds, 10 calls, 1.17948 seconds average -DecodeStep 11.7302 seconds, 511 calls, 22.9555 milliseconds average +LoadModel 600.5 milliseconds +RunComplete 14.9475 seconds +Run 14.8676 seconds +Callbacks 8.0039 milliseconds, 37 calls, 216.322 microseconds average +Spectrogram 193.196 milliseconds, 42 calls, 4.59991 milliseconds average +Sample 52.0611 milliseconds, 511 calls, 101.881 microseconds average +Encode 5.97889 seconds, 10 calls, 597.889 milliseconds average +Decode 8.8778 seconds, 10 calls, 887.78 milliseconds average +DecodeStep 8.82556 seconds, 511 calls, 17.2712 milliseconds average GPU Tasks -LoadModel 611.184 milliseconds -Run 19.4034 seconds -Encode 7.70488 seconds, 10 calls, 770.488 milliseconds average -EncodeLayer 6.5897 seconds, 240 calls, 27.4571 milliseconds average -Decode 11.6985 seconds, 10 calls, 1.16985 seconds average -DecodeStep 11.6985 seconds, 511 calls, 22.8933 milliseconds average -DecodeLayer 10.6646 seconds, 12264 calls, 869.587 microseconds average +LoadModel 457.133 milliseconds +Run 14.7971 seconds +Encode 6.01034 seconds, 10 calls, 601.034 milliseconds average +EncodeLayer 5.11447 seconds, 240 calls, 21.3103 milliseconds average +Decode 8.78676 seconds, 10 calls, 878.676 milliseconds average +DecodeStep 8.78674 seconds, 511 calls, 17.1952 milliseconds average +DecodeLayer 8.10499 seconds, 12264 calls, 660.876 microseconds average Compute Shaders -mulMatTiled 8.16985 seconds, 5290 calls, 1.5444 milliseconds average -mulMatByRowTiled 6.60967 seconds, 144789 calls, 45.6503 microseconds average -softMax 797.261 milliseconds, 12775 calls, 62.4079 microseconds average -addRepeat 571.485 milliseconds, 50256 calls, 11.3715 microseconds average -fmaRepeat1 416.121 milliseconds, 37793 calls, 11.0105 microseconds average -normFixed 411.604 milliseconds, 37793 calls, 10.891 microseconds average -softMaxFixed 383.004 milliseconds, 12504 calls, 30.6305 microseconds average -copyConvert 373.59 milliseconds, 25488 calls, 14.6575 microseconds average -copyTranspose 337.831 milliseconds, 25008 calls, 13.5089 microseconds average -addRepeatScale 227.901 milliseconds, 24528 calls, 9.29146 microseconds average -addInPlace 226.48 milliseconds, 25008 calls, 9.05631 microseconds average -addRepeatGelu 215.091 milliseconds, 12524 calls, 17.1743 microseconds average -scaleInPlace 164.065 milliseconds, 12504 calls, 13.121 microseconds average -add 139.896 milliseconds, 12274 calls, 11.3978 microseconds average -convolutionMain2Fixed 129.329 milliseconds, 10 calls, 12.9329 milliseconds average -diagMaskInf 75.8229 milliseconds, 12264 calls, 6.18256 microseconds average -convolutionMain 70.7461 milliseconds, 10 calls, 7.07461 milliseconds average -convolutionPrep1 16.0788 milliseconds, 20 calls, 803.94 microseconds average -convolutionPrep2 5.4456 milliseconds, 20 calls, 272.28 microseconds average -addRows 4.1574 milliseconds, 511 calls, 8.13581 microseconds average +mulMatTiled 6.3857 seconds, 5290 calls, 1.20713 milliseconds average +mulMatByRowTiled 4.79001 seconds, 144789 calls, 33.0827 microseconds average +normFixed 417.279 milliseconds, 37793 calls, 11.0412 microseconds average +fmaRepeat1 399.385 milliseconds, 37793 calls, 10.5677 microseconds average +softMaxFixed 382.654 milliseconds, 12504 calls, 30.6025 microseconds average +addRepeatEx 378.135 milliseconds, 37272 calls, 10.1453 microseconds average +copyConvert 319.573 milliseconds, 25488 calls, 12.5382 microseconds average +copyTranspose 258.327 milliseconds, 25008 calls, 10.3298 microseconds average +softMaxLong 244.787 milliseconds, 511 calls, 479.035 microseconds average +addRepeatScale 223.995 milliseconds, 24528 calls, 9.13223 microseconds average +addRepeatGelu 181.428 milliseconds, 12524 calls, 14.4864 microseconds average +softMax 150.065 milliseconds, 12264 calls, 12.2362 microseconds average +scaleInPlace 147.891 milliseconds, 12504 calls, 11.8275 microseconds average +addRepeat 145.362 milliseconds, 12984 calls, 11.1955 microseconds average +convolutionMain2Fixed 124.04 milliseconds, 10 calls, 12.404 milliseconds average +diagMaskInf 78.8542 milliseconds, 12264 calls, 6.42973 microseconds average +convolutionMain 66.8723 milliseconds, 10 calls, 6.68723 milliseconds average +convolutionPrep1 15.4358 milliseconds, 20 calls, 771.79 microseconds average +convolutionPrep2 3.8144 milliseconds, 20 calls, 190.72 microseconds average +addRows 3.5214 milliseconds, 511 calls, 6.89119 microseconds average +add 929.8 microseconds, 10 calls, 92.98 microseconds average Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 91.0719 MB RAM, 841.634 MB VRAM -Total 91.9293 MB RAM, 2.24976 GB VRAM +Context 91.0716 MB RAM, 785.219 MB VRAM +Total 91.929 MB RAM, 2.19467 GB VRAM diff --git a/SampleClips/columbia-medium-1650.txt b/SampleClips/columbia-medium-1650.txt index 10d6984..1e3e879 100644 --- a/SampleClips/columbia-medium-1650.txt +++ b/SampleClips/columbia-medium-1650.txt @@ -1,43 +1,44 @@ CPU Tasks -LoadModel 818.374 milliseconds -RunComplete 55.336 seconds -Run 55.238 seconds -Callbacks 8.3113 milliseconds, 37 calls, 224.63 microseconds average -Spectrogram 1.11163 seconds, 42 calls, 26.4674 milliseconds average -Sample 59.2017 milliseconds, 511 calls, 115.855 microseconds average -Encode 33.7839 seconds, 10 calls, 3.37839 seconds average -Decode 21.4456 seconds, 10 calls, 2.14456 seconds average -DecodeStep 21.3862 seconds, 511 calls, 41.8517 milliseconds average +LoadModel 939.886 milliseconds +RunComplete 48.7479 seconds +Run 48.6305 seconds +Callbacks 10.5582 milliseconds, 37 calls, 285.357 microseconds average +Spectrogram 280.966 milliseconds, 42 calls, 6.68965 milliseconds average +Sample 65.5797 milliseconds, 511 calls, 128.336 microseconds average +Encode 19.0653 seconds, 10 calls, 1.90653 seconds average +Decode 29.5369 seconds, 10 calls, 2.95369 seconds average +DecodeStep 29.4709 seconds, 511 calls, 57.6731 milliseconds average GPU Tasks -LoadModel 626.222 milliseconds -Run 55.0407 seconds -Encode 34.044 seconds, 10 calls, 3.4044 seconds average -EncodeLayer 28.8064 seconds, 240 calls, 120.027 milliseconds average -Decode 20.9967 seconds, 10 calls, 2.09967 seconds average -DecodeStep 20.9967 seconds, 511 calls, 41.0894 milliseconds average -DecodeLayer 19.0732 seconds, 12264 calls, 1.55522 milliseconds average +LoadModel 586.498 milliseconds +Run 48.4589 seconds +Encode 19.2258 seconds, 10 calls, 1.92258 seconds average +EncodeLayer 15.7109 seconds, 240 calls, 65.4622 milliseconds average +Decode 29.233 seconds, 10 calls, 2.9233 seconds average +DecodeStep 29.233 seconds, 511 calls, 57.2074 milliseconds average +DecodeLayer 27.6558 seconds, 12264 calls, 2.25504 milliseconds average Compute Shaders -mulMatTiled 36.347 seconds, 5290 calls, 6.87089 milliseconds average -mulMatByRowTiled 12.1268 seconds, 144789 calls, 83.7549 microseconds average -convolutionMain2Fixed 956.94 milliseconds, 10 calls, 95.694 milliseconds average -softMaxFixed 878.266 milliseconds, 12504 calls, 70.2388 microseconds average -softMax 708.091 milliseconds, 12775 calls, 55.4279 microseconds average -addRepeat 648.271 milliseconds, 50256 calls, 12.8994 microseconds average -copyConvert 532.099 milliseconds, 25488 calls, 20.8764 microseconds average -copyTranspose 467.681 milliseconds, 25008 calls, 18.7013 microseconds average -normFixed 393.9 milliseconds, 37793 calls, 10.4226 microseconds average -addRepeatGelu 354.445 milliseconds, 12524 calls, 28.3013 microseconds average -fmaRepeat1 348.257 milliseconds, 37793 calls, 9.21484 microseconds average -addInPlace 308.862 milliseconds, 25008 calls, 12.3505 microseconds average -convolutionMain 278.894 milliseconds, 10 calls, 27.8894 milliseconds average -addRepeatScale 199.387 milliseconds, 24528 calls, 8.12898 microseconds average -scaleInPlace 197.51 milliseconds, 12504 calls, 15.7958 microseconds average -add 134.664 milliseconds, 12274 calls, 10.9715 microseconds average -diagMaskInf 57.9927 milliseconds, 12264 calls, 4.72869 microseconds average -convolutionPrep1 41.0155 milliseconds, 20 calls, 2.05077 milliseconds average -convolutionPrep2 8.0689 milliseconds, 20 calls, 403.445 microseconds average -addRows 3.1188 milliseconds, 511 calls, 6.10333 microseconds average +mulMatTiled 19.1596 seconds, 5290 calls, 3.62186 milliseconds average +mulMatByRowTiled 12.681 seconds, 144789 calls, 87.5829 microseconds average +fmaRepeat1 3.10945 seconds, 37793 calls, 82.2758 microseconds average +copyTranspose 2.83737 seconds, 25008 calls, 113.458 microseconds average +softMaxFixed 1.6294 seconds, 12504 calls, 130.31 microseconds average +addRepeatScale 1.54396 seconds, 24528 calls, 62.9467 microseconds average +addRepeatEx 1.06992 seconds, 37272 calls, 28.7056 microseconds average +normFixed 1.06753 seconds, 37793 calls, 28.2467 microseconds average +copyConvert 994.495 milliseconds, 25488 calls, 39.0182 microseconds average +convolutionMain2Fixed 954.715 milliseconds, 10 calls, 95.4715 milliseconds average +softMax 742.126 milliseconds, 12264 calls, 60.5126 microseconds average +addRepeatGelu 506.056 milliseconds, 12524 calls, 40.4069 microseconds average +softMaxLong 491.226 milliseconds, 511 calls, 961.303 microseconds average +scaleInPlace 438.956 milliseconds, 12504 calls, 35.1053 microseconds average +addRepeat 403.997 milliseconds, 12984 calls, 31.1149 microseconds average +diagMaskInf 366.713 milliseconds, 12264 calls, 29.9016 microseconds average +convolutionMain 276.364 milliseconds, 10 calls, 27.6364 milliseconds average +convolutionPrep1 44.9126 milliseconds, 20 calls, 2.24563 milliseconds average +convolutionPrep2 20.0013 milliseconds, 20 calls, 1.00006 milliseconds average +addRows 7.2369 milliseconds, 511 calls, 14.1622 microseconds average +add 2.453 milliseconds, 10 calls, 245.3 microseconds average Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 91.0719 MB RAM, 841.634 MB VRAM -Total 91.9293 MB RAM, 2.24976 GB VRAM +Context 91.0716 MB RAM, 785.219 MB VRAM +Total 91.929 MB RAM, 2.19467 GB VRAM diff --git a/SampleClips/columbia-medium-vega7.txt b/SampleClips/columbia-medium-vega7.txt index 06b3ad3..ad5173c 100644 --- a/SampleClips/columbia-medium-vega7.txt +++ b/SampleClips/columbia-medium-vega7.txt @@ -1,46 +1,47 @@ CPU Tasks -LoadModel 1.63669 seconds -RunComplete 97.4095 seconds -Run 97.3338 seconds -Callbacks 18.5655 milliseconds, 37 calls, 501.77 microseconds average -Spectrogram 1.4999 seconds, 42 calls, 35.7119 milliseconds average -Sample 135.736 milliseconds, 511 calls, 265.628 microseconds average -Encode 61.2992 seconds, 10 calls, 6.12992 seconds average -Decode 36.0131 seconds, 10 calls, 3.60131 seconds average -DecodeStep 35.8768 seconds, 511 calls, 70.2089 milliseconds average +LoadModel 1.99675 seconds +RunComplete 81.256 seconds +Run 81.1666 seconds +Callbacks 17.8976 milliseconds, 37 calls, 483.719 microseconds average +Spectrogram 483.273 milliseconds, 42 calls, 11.5065 milliseconds average +Sample 140.511 milliseconds, 511 calls, 274.972 microseconds average +Encode 50.3768 seconds, 10 calls, 5.03768 seconds average +Decode 30.7646 seconds, 10 calls, 3.07646 seconds average +DecodeStep 30.6234 seconds, 511 calls, 59.9284 milliseconds average GPU Tasks -LoadModel 875.606 milliseconds -Run 96.9497 seconds -Encode 62.3057 seconds, 10 calls, 6.23057 seconds average -EncodeLayer 53.632 seconds, 240 calls, 223.467 milliseconds average -Decode 34.644 seconds, 10 calls, 3.4644 seconds average -DecodeStep 34.6434 seconds, 511 calls, 67.7954 milliseconds average -DecodeLayer 31.2704 seconds, 12264 calls, 2.54977 milliseconds average +LoadModel 976.318 milliseconds +Run 80.8284 seconds +Encode 51.1656 seconds, 10 calls, 5.11656 seconds average +EncodeLayer 43.8924 seconds, 240 calls, 182.885 milliseconds average +Decode 29.6502 seconds, 10 calls, 2.96502 seconds average +DecodeStep 29.6441 seconds, 511 calls, 58.012 milliseconds average +DecodeLayer 26.9439 seconds, 12264 calls, 2.19699 milliseconds average Compute Shaders -mulMatTiledEx 46.2214 seconds, 2400 calls, 19.2589 milliseconds average -mulMatTiled 17.3476 seconds, 2890 calls, 6.00262 milliseconds average -mulMatByRowTiled 13.9489 seconds, 120741 calls, 115.527 microseconds average -mulMatByRowTiledEx 5.45206 seconds, 24048 calls, 226.716 microseconds average -softMaxFixed 2.49323 seconds, 12504 calls, 199.395 microseconds average -convolutionMain2Fixed 1.51065 seconds, 10 calls, 151.065 milliseconds average -matReshapePanels 1.26582 seconds, 1450 calls, 872.982 microseconds average -addRepeat 1.21062 seconds, 50256 calls, 24.0891 microseconds average -softMax 986.762 milliseconds, 12775 calls, 77.2417 microseconds average -addRepeatGelu 937.447 milliseconds, 12524 calls, 74.852 microseconds average -copyConvert 787.692 milliseconds, 25488 calls, 30.9044 microseconds average -fmaRepeat1 769.494 milliseconds, 37793 calls, 20.3608 microseconds average -normFixed 741.028 milliseconds, 37793 calls, 19.6076 microseconds average -addRepeatScale 600.233 milliseconds, 24528 calls, 24.4714 microseconds average -addInPlace 548.734 milliseconds, 25008 calls, 21.9423 microseconds average -scaleInPlace 489.186 milliseconds, 12504 calls, 39.1224 microseconds average -convolutionMain 469.994 milliseconds, 10 calls, 46.9994 milliseconds average -copyTranspose 452.957 milliseconds, 25008 calls, 18.1125 microseconds average -add 296.072 milliseconds, 12274 calls, 24.1219 microseconds average -diagMaskInf 194.708 milliseconds, 12264 calls, 15.8764 microseconds average -convolutionPrep2 43.5675 milliseconds, 20 calls, 2.17837 milliseconds average -convolutionPrep1 40.4517 milliseconds, 20 calls, 2.02258 milliseconds average -addRows 1.6846 milliseconds, 511 calls, 3.29667 microseconds average +mulMatTiledEx 37.1919 seconds, 2400 calls, 15.4966 milliseconds average +mulMatTiled 13.9953 seconds, 2890 calls, 4.84268 milliseconds average +mulMatByRowTiled 11.8792 seconds, 120741 calls, 98.3858 microseconds average +mulMatByRowTiledEx 4.47094 seconds, 24048 calls, 185.917 microseconds average +softMaxFixed 2.44162 seconds, 12504 calls, 195.267 microseconds average +convolutionMain2Fixed 1.51096 seconds, 10 calls, 151.096 milliseconds average +matReshapePanels 1.38964 seconds, 1450 calls, 958.371 microseconds average +addRepeatGelu 963.292 milliseconds, 12524 calls, 76.9157 microseconds average +normFixed 925.912 milliseconds, 37793 calls, 24.4996 microseconds average +copyConvert 875.162 milliseconds, 25488 calls, 34.3362 microseconds average +scaleInPlace 770.121 milliseconds, 12504 calls, 61.59 microseconds average +fmaRepeat1 696.227 milliseconds, 37793 calls, 18.4221 microseconds average +copyTranspose 657.921 milliseconds, 25008 calls, 26.3084 microseconds average +addRepeatEx 630.019 milliseconds, 37272 calls, 16.9033 microseconds average +softMaxLong 623.51 milliseconds, 511 calls, 1.22018 milliseconds average +convolutionMain 471.348 milliseconds, 10 calls, 47.1348 milliseconds average +addRepeatScale 379.836 milliseconds, 24528 calls, 15.4858 microseconds average +addRepeat 354.984 milliseconds, 12984 calls, 27.3401 microseconds average +softMax 197.387 milliseconds, 12264 calls, 16.0948 microseconds average +diagMaskInf 131.012 milliseconds, 12264 calls, 10.6827 microseconds average +convolutionPrep2 49.7619 milliseconds, 20 calls, 2.48809 milliseconds average +convolutionPrep1 42.2907 milliseconds, 20 calls, 2.11454 milliseconds average +add 10.5473 milliseconds, 10 calls, 1.05473 milliseconds average +addRows 2.1075 milliseconds, 511 calls, 4.12427 microseconds average Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 91.0721 MB RAM, 893.634 MB VRAM -Total 91.9295 MB RAM, 2.30054 GB VRAM +Context 91.0716 MB RAM, 833.407 MB VRAM +Total 91.929 MB RAM, 2.24172 GB VRAM diff --git a/SampleClips/columbia-medium-vega8.txt b/SampleClips/columbia-medium-vega8.txt index 8f81d2d..8b0e089 100644 --- a/SampleClips/columbia-medium-vega8.txt +++ b/SampleClips/columbia-medium-vega8.txt @@ -1,46 +1,47 @@  CPU Tasks -LoadModel 854.438 milliseconds -RunComplete 76.7069 seconds -Run 76.6544 seconds -Callbacks 10.1743 milliseconds, 37 calls, 274.981 microseconds average -Spectrogram 687.562 milliseconds, 42 calls, 16.3705 milliseconds average -Sample 73.812 milliseconds, 511 calls, 144.446 microseconds average -Encode 45.4051 seconds, 10 calls, 4.54051 seconds average -Decode 31.2366 seconds, 10 calls, 3.12366 seconds average -DecodeStep 31.1625 seconds, 511 calls, 60.9834 milliseconds average +LoadModel 841.605 milliseconds +RunComplete 62.1145 seconds +Run 62.0268 seconds +Callbacks 10.184 milliseconds, 37 calls, 275.243 microseconds average +Spectrogram 200.241 milliseconds, 42 calls, 4.76764 milliseconds average +Sample 63.0473 milliseconds, 511 calls, 123.38 microseconds average +Encode 37.2409 seconds, 10 calls, 3.72409 seconds average +Decode 24.7715 seconds, 10 calls, 2.47715 seconds average +DecodeStep 24.7082 seconds, 511 calls, 48.3526 milliseconds average GPU Tasks -LoadModel 473.758 milliseconds -Run 76.3483 seconds -Encode 46.1492 seconds, 10 calls, 4.61492 seconds average -EncodeLayer 39.5117 seconds, 240 calls, 164.632 milliseconds average -Decode 30.199 seconds, 10 calls, 3.0199 seconds average -DecodeStep 30.1986 seconds, 511 calls, 59.097 milliseconds average -DecodeLayer 27.4002 seconds, 12264 calls, 2.2342 milliseconds average +LoadModel 410.579 milliseconds +Run 61.8044 seconds +Encode 37.8702 seconds, 10 calls, 3.78702 seconds average +EncodeLayer 32.1896 seconds, 240 calls, 134.123 milliseconds average +Decode 23.9262 seconds, 10 calls, 2.39262 seconds average +DecodeStep 23.9233 seconds, 511 calls, 46.8167 milliseconds average +DecodeLayer 21.6888 seconds, 12264 calls, 1.76849 milliseconds average Compute Shaders -mulMatTiledEx 34.2075 seconds, 2400 calls, 14.2531 milliseconds average -mulMatTiled 12.4474 seconds, 2890 calls, 4.30705 milliseconds average -mulMatByRowTiled 11.7568 seconds, 120741 calls, 97.3723 microseconds average -mulMatByRowTiledEx 4.9317 seconds, 24048 calls, 205.077 microseconds average -softMaxFixed 2.26629 seconds, 12504 calls, 181.245 microseconds average -convolutionMain2Fixed 1.29108 seconds, 10 calls, 129.108 milliseconds average -addRepeat 1.11767 seconds, 50256 calls, 22.2396 microseconds average -matReshapePanels 1.06919 seconds, 1450 calls, 737.369 microseconds average -copyConvert 881.162 milliseconds, 25488 calls, 34.5717 microseconds average -addRepeatGelu 767.11 milliseconds, 12524 calls, 61.2512 microseconds average -softMax 759.06 milliseconds, 12775 calls, 59.4176 microseconds average -addRepeatScale 727.529 milliseconds, 24528 calls, 29.6612 microseconds average -addInPlace 695.233 milliseconds, 25008 calls, 27.8004 microseconds average -fmaRepeat1 608.837 milliseconds, 37793 calls, 16.1098 microseconds average -normFixed 574.785 milliseconds, 37793 calls, 15.2088 microseconds average -copyTranspose 559.208 milliseconds, 25008 calls, 22.3612 microseconds average -scaleInPlace 512.856 milliseconds, 12504 calls, 41.0153 microseconds average -convolutionMain 328.414 milliseconds, 10 calls, 32.8414 milliseconds average -add 254.621 milliseconds, 12274 calls, 20.7447 microseconds average -diagMaskInf 223.898 milliseconds, 12264 calls, 18.2565 microseconds average -convolutionPrep2 42.5244 milliseconds, 20 calls, 2.12622 milliseconds average -convolutionPrep1 31.1691 milliseconds, 20 calls, 1.55845 milliseconds average -addRows 1.4072 milliseconds, 511 calls, 2.75382 microseconds average +mulMatTiledEx 27.5216 seconds, 2400 calls, 11.4673 milliseconds average +mulMatTiled 10.2385 seconds, 2890 calls, 3.54273 milliseconds average +mulMatByRowTiled 9.38114 seconds, 120741 calls, 77.6964 microseconds average +mulMatByRowTiledEx 4.19991 seconds, 24048 calls, 174.647 microseconds average +softMaxFixed 1.95105 seconds, 12504 calls, 156.034 microseconds average +convolutionMain2Fixed 1.31354 seconds, 10 calls, 131.354 milliseconds average +matReshapePanels 1.04699 seconds, 1450 calls, 722.064 microseconds average +addRepeatGelu 777.683 milliseconds, 12524 calls, 62.0954 microseconds average +scaleInPlace 750.056 milliseconds, 12504 calls, 59.9853 microseconds average +copyConvert 701.517 milliseconds, 25488 calls, 27.5234 microseconds average +normFixed 697.931 milliseconds, 37793 calls, 18.4672 microseconds average +fmaRepeat1 529.007 milliseconds, 37793 calls, 13.9975 microseconds average +addRepeatEx 511.269 milliseconds, 37272 calls, 13.7172 microseconds average +copyTranspose 459.017 milliseconds, 25008 calls, 18.3548 microseconds average +softMaxLong 382.205 milliseconds, 511 calls, 747.955 microseconds average +convolutionMain 328.996 milliseconds, 10 calls, 32.8996 milliseconds average +addRepeat 305.31 milliseconds, 12984 calls, 23.5144 microseconds average +addRepeatScale 261.749 milliseconds, 24528 calls, 10.6715 microseconds average +softMax 148.894 milliseconds, 12264 calls, 12.1407 microseconds average +diagMaskInf 104.681 milliseconds, 12264 calls, 8.53562 microseconds average +convolutionPrep2 45.8033 milliseconds, 20 calls, 2.29017 milliseconds average +convolutionPrep1 32.3779 milliseconds, 20 calls, 1.61889 milliseconds average +add 7.3228 milliseconds, 10 calls, 732.28 microseconds average +addRows 1.6948 milliseconds, 511 calls, 3.31663 microseconds average Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 91.0721 MB RAM, 893.634 MB VRAM -Total 91.9295 MB RAM, 2.30054 GB VRAM +Context 91.0716 MB RAM, 833.407 MB VRAM +Total 91.929 MB RAM, 2.24172 GB VRAM diff --git a/SampleClips/jfk-large-1080ti.txt b/SampleClips/jfk-large-1080ti.txt index 6b963cb..9d46942 100644 --- a/SampleClips/jfk-large-1080ti.txt +++ b/SampleClips/jfk-large-1080ti.txt @@ -1,43 +1,44 @@  CPU Tasks -LoadModel 1.31643 seconds -RunComplete 2.62992 seconds -Run 2.55991 seconds -Callbacks 268.8 microseconds, 4 calls, 67.2 microseconds average -Spectrogram 41.7164 milliseconds, 3 calls, 13.9055 milliseconds average -Sample 3.7334 milliseconds, 27 calls, 138.274 microseconds average -Encode 1.59685 seconds -Decode 962.766 milliseconds -DecodeStep 959.004 milliseconds, 27 calls, 35.5187 milliseconds average +LoadModel 945.134 milliseconds +RunComplete 2.19628 seconds +Run 2.08991 seconds +Callbacks 762.3 microseconds, 4 calls, 190.575 microseconds average +Spectrogram 12.2602 milliseconds, 3 calls, 4.08673 milliseconds average +Sample 3.2495 milliseconds, 27 calls, 120.352 microseconds average +Encode 1.31469 seconds +Decode 774.432 milliseconds +DecodeStep 771.17 milliseconds, 27 calls, 28.5618 milliseconds average GPU Tasks -LoadModel 1.16929 seconds -Run 2.50813 seconds -Encode 1.54197 seconds -EncodeLayer 1.31304 seconds, 32 calls, 41.0324 milliseconds average -Decode 966.163 milliseconds -DecodeStep 966.159 milliseconds, 27 calls, 35.7837 milliseconds average -DecodeLayer 902.348 milliseconds, 864 calls, 1.04438 milliseconds average +LoadModel 803.014 milliseconds +Run 2.07007 seconds +Encode 1.29615 seconds +EncodeLayer 1.11858 seconds, 32 calls, 34.9556 milliseconds average +Decode 773.917 milliseconds +DecodeStep 773.915 milliseconds, 27 calls, 28.6635 milliseconds average +DecodeLayer 719.389 milliseconds, 864 calls, 832.626 microseconds average Compute Shaders -mulMatTiled 1.48565 seconds, 705 calls, 2.10731 milliseconds average -mulMatByRowTiled 597.295 milliseconds, 10010 calls, 59.6698 microseconds average -norm 73.0336 milliseconds, 2684 calls, 27.2107 microseconds average -addRepeat 53.7049 milliseconds, 3616 calls, 14.852 microseconds average -softMaxFixed 42.5443 milliseconds, 896 calls, 47.4825 microseconds average -softMax 42.278 milliseconds, 891 calls, 47.4501 microseconds average -fmaRepeat1 32.9186 milliseconds, 2684 calls, 12.2648 microseconds average -copyConvert 30.5182 milliseconds, 1856 calls, 16.443 microseconds average -copyTranspose 23.707 milliseconds, 1792 calls, 13.2294 microseconds average -convolutionMain2Fixed 20.2435 milliseconds -addInPlace 20.0419 milliseconds, 1792 calls, 11.1841 microseconds average -addRepeatGelu 19.4727 milliseconds, 898 calls, 21.6845 microseconds average -addRepeatScale 17.2226 milliseconds, 1728 calls, 9.96678 microseconds average -scaleInPlace 15.5358 milliseconds, 896 calls, 17.3391 microseconds average -add 11.4178 milliseconds, 865 calls, 13.1998 microseconds average -convolutionMain 8.7583 milliseconds -diagMaskInf 5.3196 milliseconds, 864 calls, 6.15694 microseconds average -convolutionPrep1 2.3276 milliseconds, 2 calls, 1.1638 milliseconds average -convolutionPrep2 572.4 microseconds, 2 calls, 286.2 microseconds average -addRows 207.9 microseconds, 27 calls, 7.7 microseconds average +mulMatTiled 1.20543 seconds, 705 calls, 1.70983 milliseconds average +mulMatByRowTiled 471.054 milliseconds, 10010 calls, 47.0584 microseconds average +norm 72.3989 milliseconds, 2684 calls, 26.9743 microseconds average +fmaRepeat1 41.7212 milliseconds, 2684 calls, 15.5444 microseconds average +copyTranspose 41.168 milliseconds, 1792 calls, 22.9732 microseconds average +softMaxFixed 40.8861 milliseconds, 896 calls, 45.6318 microseconds average +addRepeatEx 30.1243 milliseconds, 2656 calls, 11.342 microseconds average +copyConvert 28.9217 milliseconds, 1856 calls, 15.5828 microseconds average +softMaxLong 25.3209 milliseconds, 27 calls, 937.811 microseconds average +convolutionMain2Fixed 19.8769 milliseconds +addRepeatScale 18.2236 milliseconds, 1728 calls, 10.5461 microseconds average +addRepeatGelu 15.7554 milliseconds, 898 calls, 17.545 microseconds average +addRepeat 14.2968 milliseconds, 960 calls, 14.8925 microseconds average +scaleInPlace 13.9332 milliseconds, 896 calls, 15.5504 microseconds average +softMax 8.5928 milliseconds, 864 calls, 9.94537 microseconds average +convolutionMain 8.532 milliseconds +diagMaskInf 5.6745 milliseconds, 864 calls, 6.56771 microseconds average +convolutionPrep1 2.303 milliseconds, 2 calls, 1.1515 milliseconds average +convolutionPrep2 422.9 microseconds, 2 calls, 211.45 microseconds average +addRows 198.7 microseconds, 27 calls, 7.35926 microseconds average +add 119.8 microseconds Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 1.98413 MB RAM, 1.07361 GB VRAM -Total 2.8558 MB RAM, 3.95511 GB VRAM +Context 1.98376 MB RAM, 1.07641 GB VRAM +Total 2.85543 MB RAM, 3.95791 GB VRAM diff --git a/SampleClips/jfk-large-1650.txt b/SampleClips/jfk-large-1650.txt index 9c6e4b8..e11694a 100644 --- a/SampleClips/jfk-large-1650.txt +++ b/SampleClips/jfk-large-1650.txt @@ -1,43 +1,44 @@ CPU Tasks -LoadModel 1.4018 seconds -RunComplete 8.71063 seconds -Run 8.64303 seconds -Callbacks 251.9 microseconds, 4 calls, 62.975 microseconds average -Spectrogram 62.1203 milliseconds, 3 calls, 20.7068 milliseconds average -Sample 3.5493 milliseconds, 27 calls, 131.456 microseconds average -Encode 6.90879 seconds -Decode 1.73396 seconds -DecodeStep 1.73039 seconds, 27 calls, 64.0887 milliseconds average +LoadModel 7.92578 seconds +RunComplete 8.33686 seconds +Run 8.25683 seconds +Callbacks 337.7 microseconds, 4 calls, 84.425 microseconds average +Spectrogram 16.4214 milliseconds, 3 calls, 5.4738 milliseconds average +Sample 3.8768 milliseconds, 27 calls, 143.585 microseconds average +Encode 4.14309 seconds +Decode 4.11338 seconds +DecodeStep 4.10947 seconds, 27 calls, 152.203 milliseconds average GPU Tasks -LoadModel 1.20907 seconds -Run 8.4523 seconds -Encode 6.83046 seconds -EncodeLayer 5.71692 seconds, 32 calls, 178.654 milliseconds average -Decode 1.62184 seconds -DecodeStep 1.62184 seconds, 27 calls, 60.068 milliseconds average -DecodeLayer 1.51049 seconds, 864 calls, 1.74825 milliseconds average +LoadModel 7.53025 seconds +Run 8.05464 seconds +Encode 3.98133 seconds +EncodeLayer 3.29696 seconds, 32 calls, 103.03 milliseconds average +Decode 4.07331 seconds +DecodeStep 4.07331 seconds, 27 calls, 150.863 milliseconds average +DecodeLayer 3.81856 seconds, 864 calls, 4.41963 milliseconds average Compute Shaders -mulMatTiled 6.39268 seconds, 705 calls, 9.06763 milliseconds average -mulMatByRowTiled 1.09505 seconds, 10010 calls, 109.395 microseconds average -convolutionMain2Fixed 155.164 milliseconds -convolutionMain 123.525 milliseconds -softMaxFixed 120.173 milliseconds, 896 calls, 134.122 microseconds average -norm 84.1752 milliseconds, 2684 calls, 31.3618 microseconds average -copyConvert 78.0956 milliseconds, 1856 calls, 42.0774 microseconds average -addRepeat 63.3192 milliseconds, 3616 calls, 17.5108 microseconds average -fmaRepeat1 56.6908 milliseconds, 2684 calls, 21.1218 microseconds average -softMax 54.6717 milliseconds, 891 calls, 61.3599 microseconds average -addInPlace 39.7892 milliseconds, 1792 calls, 22.2038 microseconds average -copyTranspose 38.8897 milliseconds, 1792 calls, 21.7018 microseconds average -addRepeatGelu 34.762 milliseconds, 898 calls, 38.7105 microseconds average -add 33.3001 milliseconds, 865 calls, 38.4972 microseconds average -scaleInPlace 24.343 milliseconds, 896 calls, 27.1685 microseconds average -addRepeatScale 18.8872 milliseconds, 1728 calls, 10.9301 microseconds average -convolutionPrep1 7.8052 milliseconds, 2 calls, 3.9026 milliseconds average -diagMaskInf 4.1647 milliseconds, 864 calls, 4.82025 microseconds average -convolutionPrep2 1.209 milliseconds, 2 calls, 604.5 microseconds average -addRows 183.6 microseconds, 27 calls, 6.8 microseconds average +mulMatTiled 3.6307 seconds, 705 calls, 5.14993 milliseconds average +mulMatByRowTiled 1.45034 seconds, 10010 calls, 144.889 microseconds average +fmaRepeat1 774.011 milliseconds, 2684 calls, 288.38 microseconds average +copyTranspose 416.996 milliseconds, 1792 calls, 232.699 microseconds average +addRepeatScale 223.798 milliseconds, 1728 calls, 129.513 microseconds average +norm 211.821 milliseconds, 2684 calls, 78.9197 microseconds average +softMaxFixed 209.124 milliseconds, 896 calls, 233.398 microseconds average +copyConvert 186.898 milliseconds, 1856 calls, 100.699 microseconds average +addRepeatEx 176.985 milliseconds, 2656 calls, 66.6361 microseconds average +softMaxLong 162.482 milliseconds, 27 calls, 6.01787 milliseconds average +convolutionMain2Fixed 154.739 milliseconds +addRepeatGelu 132.088 milliseconds, 898 calls, 147.091 microseconds average +softMax 98.1905 milliseconds, 864 calls, 113.646 microseconds average +scaleInPlace 57.3956 milliseconds, 896 calls, 64.0576 microseconds average +convolutionMain 45.7933 milliseconds +addRepeat 36.9315 milliseconds, 960 calls, 38.4703 microseconds average +diagMaskInf 28.2987 milliseconds, 864 calls, 32.7531 microseconds average +convolutionPrep1 9.0334 milliseconds, 2 calls, 4.5167 milliseconds average +convolutionPrep2 1.1608 milliseconds, 2 calls, 580.4 microseconds average +add 320 microseconds +addRows 244 microseconds, 27 calls, 9.03704 microseconds average Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 1.98413 MB RAM, 1.07361 GB VRAM -Total 2.8558 MB RAM, 3.95511 GB VRAM +Context 1.98376 MB RAM, 1.07641 GB VRAM +Total 2.85543 MB RAM, 3.95791 GB VRAM diff --git a/SampleClips/jfk-large-vega7.txt b/SampleClips/jfk-large-vega7.txt index 712c626..780593a 100644 --- a/SampleClips/jfk-large-vega7.txt +++ b/SampleClips/jfk-large-vega7.txt @@ -1,46 +1,47 @@ CPU Tasks -LoadModel 2.48295 seconds -RunComplete 19.41 seconds -Run 19.325 seconds -Callbacks 938.5 microseconds, 4 calls, 234.625 microseconds average -Spectrogram 101.776 milliseconds, 3 calls, 33.9253 milliseconds average -Sample 7.4609 milliseconds, 27 calls, 276.33 microseconds average -Encode 16.6219 seconds -Decode 2.7018 seconds -DecodeStep 2.69429 seconds, 27 calls, 99.7886 milliseconds average +LoadModel 3.22847 seconds +RunComplete 14.2729 seconds +Run 14.186 seconds +Callbacks 674.7 microseconds, 4 calls, 168.675 microseconds average +Spectrogram 29.6112 milliseconds, 3 calls, 9.8704 milliseconds average +Sample 7.7473 milliseconds, 27 calls, 286.937 microseconds average +Encode 11.8931 seconds +Decode 2.29185 seconds +DecodeStep 2.28406 seconds, 27 calls, 84.5949 milliseconds average GPU Tasks -LoadModel 1.59925 seconds -Run 19.1489 seconds -Encode 16.6535 seconds -EncodeLayer 14.7506 seconds, 32 calls, 460.957 milliseconds average -Decode 2.4954 seconds -DecodeStep 2.49537 seconds, 27 calls, 92.4212 milliseconds average -DecodeLayer 2.33892 seconds, 864 calls, 2.70708 milliseconds average +LoadModel 1.9083 seconds +Run 14.0698 seconds +Encode 11.9404 seconds +EncodeLayer 10.2786 seconds, 32 calls, 321.205 milliseconds average +Decode 2.12941 seconds +DecodeStep 2.12938 seconds, 27 calls, 78.8661 milliseconds average +DecodeLayer 1.98655 seconds, 864 calls, 2.29924 milliseconds average Compute Shaders -mulMatTiledEx 10.8399 seconds, 320 calls, 33.8745 milliseconds average -mulMatTiled 2.40696 seconds, 385 calls, 6.25184 milliseconds average -norm 1.1565 seconds, 2684 calls, 430.885 microseconds average -addRepeatGelu 1.1138 seconds, 898 calls, 1.24031 milliseconds average -mulMatByRowTiled 1.08614 seconds, 8346 calls, 130.14 microseconds average -mulMatByRowTiledEx 692.772 milliseconds, 1664 calls, 416.329 microseconds average -softMaxFixed 416.688 milliseconds, 896 calls, 465.053 microseconds average -convolutionMain2Fixed 415.361 milliseconds -matReshapePanels 179.52 milliseconds, 193 calls, 930.155 microseconds average -addRepeat 171.572 milliseconds, 3616 calls, 47.4479 microseconds average -convolutionMain 126.679 milliseconds -copyConvert 95.046 milliseconds, 1856 calls, 51.2101 microseconds average -fmaRepeat1 74.6558 milliseconds, 2684 calls, 27.8151 microseconds average -copyTranspose 73.589 milliseconds, 1792 calls, 41.0653 microseconds average -addInPlace 67.0819 milliseconds, 1792 calls, 37.4341 microseconds average -scaleInPlace 66.1625 milliseconds, 896 calls, 73.8421 microseconds average -softMax 65.629 milliseconds, 891 calls, 73.6577 microseconds average -addRepeatScale 29.3899 milliseconds, 1728 calls, 17.008 microseconds average -add 25.2651 milliseconds, 865 calls, 29.2082 microseconds average -convolutionPrep1 13.325 milliseconds, 2 calls, 6.6625 milliseconds average -diagMaskInf 11.2047 milliseconds, 864 calls, 12.9684 microseconds average -convolutionPrep2 5.9717 milliseconds, 2 calls, 2.98585 milliseconds average -addRows 93.7 microseconds, 27 calls, 3.47037 microseconds average +mulMatTiledEx 8.49883 seconds, 320 calls, 26.5589 milliseconds average +mulMatTiled 2.04655 seconds, 385 calls, 5.31573 milliseconds average +mulMatByRowTiled 982.48 milliseconds, 8346 calls, 117.719 microseconds average +mulMatByRowTiledEx 481.123 milliseconds, 1664 calls, 289.137 microseconds average +convolutionMain2Fixed 415.244 milliseconds +softMaxFixed 404.223 milliseconds, 896 calls, 451.142 microseconds average +matReshapePanels 210.915 milliseconds, 193 calls, 1.09282 milliseconds average +norm 154.589 milliseconds, 2684 calls, 57.5965 microseconds average +convolutionMain 126.883 milliseconds +addRepeatGelu 112.131 milliseconds, 898 calls, 124.867 microseconds average +copyConvert 100.589 milliseconds, 1856 calls, 54.1968 microseconds average +scaleInPlace 91.3539 milliseconds, 896 calls, 101.957 microseconds average +fmaRepeat1 86.7731 milliseconds, 2684 calls, 32.3298 microseconds average +copyTranspose 77.5852 milliseconds, 1792 calls, 43.2953 microseconds average +addRepeat 76.3677 milliseconds, 960 calls, 79.5497 microseconds average +addRepeatEx 70.8699 milliseconds, 2656 calls, 26.6829 microseconds average +softMaxLong 39.4035 milliseconds, 27 calls, 1.45939 milliseconds average +addRepeatScale 25.0842 milliseconds, 1728 calls, 14.5163 microseconds average +softMax 14.0555 milliseconds, 864 calls, 16.2679 microseconds average +convolutionPrep1 13.9331 milliseconds, 2 calls, 6.96655 milliseconds average +diagMaskInf 8.1717 milliseconds, 864 calls, 9.45799 microseconds average +convolutionPrep2 5.2098 milliseconds, 2 calls, 2.6049 milliseconds average +add 2.9724 milliseconds +addRows 91.2 microseconds, 27 calls, 3.37778 microseconds average Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 1.98427 MB RAM, 1.13175 GB VRAM -Total 2.85594 MB RAM, 4.01325 GB VRAM +Context 1.98376 MB RAM, 1.13447 GB VRAM +Total 2.85543 MB RAM, 4.01597 GB VRAM diff --git a/SampleClips/jfk-large-vega8.txt b/SampleClips/jfk-large-vega8.txt index 2e488c9..de7ae1c 100644 --- a/SampleClips/jfk-large-vega8.txt +++ b/SampleClips/jfk-large-vega8.txt @@ -1,46 +1,47 @@  CPU Tasks -LoadModel 1.61442 seconds -RunComplete 11.1008 seconds -Run 11.0526 seconds -Callbacks 364.6 microseconds, 4 calls, 91.15 microseconds average -Spectrogram 42.5458 milliseconds, 3 calls, 14.1819 milliseconds average -Sample 4.1022 milliseconds, 27 calls, 151.933 microseconds average -Encode 8.77118 seconds -Decode 2.28079 seconds -DecodeStep 2.27667 seconds, 27 calls, 84.3212 milliseconds average +LoadModel 1.57347 seconds +RunComplete 9.46787 seconds +Run 9.40671 seconds +Callbacks 292.1 microseconds, 4 calls, 73.025 microseconds average +Spectrogram 12.2725 milliseconds, 3 calls, 4.09083 milliseconds average +Sample 3.5692 milliseconds, 27 calls, 132.193 microseconds average +Encode 7.26322 seconds +Decode 2.14291 seconds +DecodeStep 2.13933 seconds, 27 calls, 79.2343 milliseconds average GPU Tasks -LoadModel 1.04929 seconds -Run 10.9677 seconds -Encode 8.85998 seconds -EncodeLayer 7.57052 seconds, 32 calls, 236.579 milliseconds average -Decode 2.10771 seconds -DecodeStep 2.10769 seconds, 27 calls, 78.0627 milliseconds average -DecodeLayer 1.99148 seconds, 864 calls, 2.30495 milliseconds average +LoadModel 991.351 milliseconds +Run 9.36883 seconds +Encode 7.35144 seconds +EncodeLayer 6.25822 seconds, 32 calls, 195.569 milliseconds average +Decode 2.01739 seconds +DecodeStep 2.01737 seconds, 27 calls, 74.7173 milliseconds average +DecodeLayer 1.88943 seconds, 864 calls, 2.18684 milliseconds average Compute Shaders -mulMatTiledEx 6.65327 seconds, 320 calls, 20.7915 milliseconds average -mulMatTiled 1.39618 seconds, 385 calls, 3.62644 milliseconds average -mulMatByRowTiled 962.672 milliseconds, 8346 calls, 115.345 microseconds average -mulMatByRowTiledEx 610.853 milliseconds, 1664 calls, 367.099 microseconds average -softMaxFixed 282.392 milliseconds, 896 calls, 315.17 microseconds average -convolutionMain2Fixed 201.356 milliseconds -addRepeat 132.117 milliseconds, 3616 calls, 36.5368 microseconds average -matReshapePanels 125.575 milliseconds, 193 calls, 650.649 microseconds average -norm 118.464 milliseconds, 2684 calls, 44.1371 microseconds average -addRepeatGelu 86.6617 milliseconds, 898 calls, 96.5052 microseconds average -copyConvert 69.9066 milliseconds, 1856 calls, 37.6652 microseconds average -fmaRepeat1 53.2854 milliseconds, 2684 calls, 19.853 microseconds average -scaleInPlace 51.3726 milliseconds, 896 calls, 57.3355 microseconds average -copyTranspose 47.1209 milliseconds, 1792 calls, 26.2951 microseconds average -addInPlace 45.977 milliseconds, 1792 calls, 25.6568 microseconds average -convolutionMain 39.939 milliseconds -softMax 32.0882 milliseconds, 891 calls, 36.0137 microseconds average -addRepeatScale 19.924 milliseconds, 1728 calls, 11.5301 microseconds average -add 16.1369 milliseconds, 865 calls, 18.6554 microseconds average -diagMaskInf 6.8347 milliseconds, 864 calls, 7.91053 microseconds average -convolutionPrep1 4.9909 milliseconds, 2 calls, 2.49545 milliseconds average -convolutionPrep2 2.6408 milliseconds, 2 calls, 1.3204 milliseconds average -addRows 70.9 microseconds, 27 calls, 2.62593 microseconds average +mulMatTiledEx 5.37127 seconds, 320 calls, 16.7852 milliseconds average +mulMatTiled 1.17596 seconds, 385 calls, 3.05444 milliseconds average +mulMatByRowTiled 878.04 milliseconds, 8346 calls, 105.205 microseconds average +mulMatByRowTiledEx 460.074 milliseconds, 1664 calls, 276.486 microseconds average +softMaxFixed 288.221 milliseconds, 896 calls, 321.675 microseconds average +convolutionMain2Fixed 201.063 milliseconds +norm 141.073 milliseconds, 2684 calls, 52.5606 microseconds average +matReshapePanels 138.851 milliseconds, 193 calls, 719.436 microseconds average +addRepeatGelu 89.1783 milliseconds, 898 calls, 99.3077 microseconds average +copyConvert 83.2232 milliseconds, 1856 calls, 44.8401 microseconds average +scaleInPlace 77.8363 milliseconds, 896 calls, 86.8709 microseconds average +fmaRepeat1 77.8123 milliseconds, 2684 calls, 28.9912 microseconds average +addRepeatEx 76.9018 milliseconds, 2656 calls, 28.954 microseconds average +addRepeat 66.8479 milliseconds, 960 calls, 69.6332 microseconds average +copyTranspose 62.5101 milliseconds, 1792 calls, 34.8829 microseconds average +addRepeatScale 40.5807 milliseconds, 1728 calls, 23.4842 microseconds average +convolutionMain 39.8186 milliseconds +softMaxLong 32.0594 milliseconds, 27 calls, 1.18739 milliseconds average +softMax 15.9281 milliseconds, 864 calls, 18.4353 microseconds average +diagMaskInf 12.6164 milliseconds, 864 calls, 14.6023 microseconds average +convolutionPrep1 5.4486 milliseconds, 2 calls, 2.7243 milliseconds average +convolutionPrep2 4.0996 milliseconds, 2 calls, 2.0498 milliseconds average +add 883.4 microseconds +addRows 73.4 microseconds, 27 calls, 2.71852 microseconds average Memory Usage Model 892.591 KB RAM, 2.8815 GB VRAM -Context 1.98427 MB RAM, 1.13175 GB VRAM -Total 2.85594 MB RAM, 4.01325 GB VRAM +Context 1.98376 MB RAM, 1.13447 GB VRAM +Total 2.85543 MB RAM, 4.01597 GB VRAM diff --git a/SampleClips/jfk-medium-1080ti.txt b/SampleClips/jfk-medium-1080ti.txt index f76376d..ffb053f 100644 --- a/SampleClips/jfk-medium-1080ti.txt +++ b/SampleClips/jfk-medium-1080ti.txt @@ -1,43 +1,44 @@  CPU Tasks -LoadModel 751.527 milliseconds -RunComplete 1.46731 seconds -Run 1.39689 seconds -Callbacks 319.7 microseconds, 4 calls, 79.925 microseconds average -Spectrogram 40.711 milliseconds, 3 calls, 13.5703 milliseconds average -Sample 3.6208 milliseconds, 28 calls, 129.314 microseconds average -Encode 803.503 milliseconds -Decode 593.049 milliseconds -DecodeStep 589.41 milliseconds, 28 calls, 21.0504 milliseconds average +LoadModel 593.861 milliseconds +RunComplete 1.13909 seconds +Run 1.06578 seconds +Callbacks 279.4 microseconds, 4 calls, 69.85 microseconds average +Spectrogram 12.0744 milliseconds, 3 calls, 4.0248 milliseconds average +Sample 3.0016 milliseconds, 28 calls, 107.2 microseconds average +Encode 614.44 milliseconds +Decode 451.048 milliseconds +DecodeStep 448.036 milliseconds, 28 calls, 16.0013 milliseconds average GPU Tasks -LoadModel 597.603 milliseconds -Run 1.34198 seconds -Encode 754.654 milliseconds -EncodeLayer 645.794 milliseconds, 24 calls, 26.9081 milliseconds average -Decode 587.324 milliseconds -DecodeStep 587.321 milliseconds, 28 calls, 20.9758 milliseconds average -DecodeLayer 543.185 milliseconds, 672 calls, 808.311 microseconds average +LoadModel 452.128 milliseconds +Run 1.0518 seconds +Encode 599.964 milliseconds +EncodeLayer 506.67 milliseconds, 24 calls, 21.1113 milliseconds average +Decode 451.832 milliseconds +DecodeStep 451.828 milliseconds, 28 calls, 16.1367 milliseconds average +DecodeLayer 412.142 milliseconds, 672 calls, 613.307 microseconds average Compute Shaders -mulMatTiled 723.882 milliseconds, 529 calls, 1.3684 milliseconds average -mulMatByRowTiled 346.71 milliseconds, 7803 calls, 44.4329 microseconds average -softMax 39.6096 milliseconds, 700 calls, 56.5851 microseconds average -addRepeat 31.2462 milliseconds, 2808 calls, 11.1276 microseconds average -softMaxFixed 27.2224 milliseconds, 696 calls, 39.1126 microseconds average -normFixed 24.8054 milliseconds, 2093 calls, 11.8516 microseconds average -fmaRepeat1 24.7513 milliseconds, 2093 calls, 11.8258 microseconds average -copyConvert 19.778 milliseconds, 1440 calls, 13.7347 microseconds average -copyTranspose 18.6921 milliseconds, 1392 calls, 13.4282 microseconds average -addRepeatScale 13.4873 milliseconds, 1344 calls, 10.0352 microseconds average -addInPlace 13.0325 milliseconds, 1392 calls, 9.36243 microseconds average -convolutionMain2Fixed 12.33 milliseconds -addRepeatGelu 12.1985 milliseconds, 698 calls, 17.4764 microseconds average -scaleInPlace 10.3726 milliseconds, 696 calls, 14.9032 microseconds average -add 8.0935 milliseconds, 673 calls, 12.026 microseconds average -convolutionMain 6.6079 milliseconds -diagMaskInf 3.9483 milliseconds, 672 calls, 5.87545 microseconds average -convolutionPrep1 1.5073 milliseconds, 2 calls, 753.65 microseconds average -convolutionPrep2 540.7 microseconds, 2 calls, 270.35 microseconds average -addRows 204.8 microseconds, 28 calls, 7.31429 microseconds average +mulMatTiled 562.478 milliseconds, 529 calls, 1.06329 milliseconds average +mulMatByRowTiled 256.062 milliseconds, 7803 calls, 32.8158 microseconds average +softMaxFixed 27.1687 milliseconds, 696 calls, 39.0355 microseconds average +normFixed 24.1828 milliseconds, 2093 calls, 11.5541 microseconds average +fmaRepeat1 23.3089 milliseconds, 2093 calls, 11.1366 microseconds average +addRepeatEx 22.3395 milliseconds, 2064 calls, 10.8234 microseconds average +softMaxLong 19.7192 milliseconds, 28 calls, 704.257 microseconds average +copyConvert 19.301 milliseconds, 1440 calls, 13.4035 microseconds average +copyTranspose 15.3011 milliseconds, 1392 calls, 10.9922 microseconds average +addRepeatScale 13.6043 milliseconds, 1344 calls, 10.1222 microseconds average +convolutionMain2Fixed 12.1242 milliseconds +addRepeatGelu 11.6172 milliseconds, 698 calls, 16.6436 microseconds average +addRepeat 11.5331 milliseconds, 744 calls, 15.5015 microseconds average +scaleInPlace 9.5743 milliseconds, 696 calls, 13.7562 microseconds average +convolutionMain 7.0349 milliseconds +softMax 5.8329 milliseconds, 672 calls, 8.67991 microseconds average +diagMaskInf 4.5297 milliseconds, 672 calls, 6.74062 microseconds average +convolutionPrep1 1.5258 milliseconds, 2 calls, 762.9 microseconds average +convolutionPrep2 383 microseconds, 2 calls, 191.5 microseconds average +addRows 194.6 microseconds, 28 calls, 6.95 microseconds average +add 95.2 microseconds Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 1.98347 MB RAM, 723.729 MB VRAM -Total 2.84085 MB RAM, 2.13462 GB VRAM +Context 1.9831 MB RAM, 723.782 MB VRAM +Total 2.84049 MB RAM, 2.13467 GB VRAM diff --git a/SampleClips/jfk-medium-1650.txt b/SampleClips/jfk-medium-1650.txt index b072607..baa9097 100644 --- a/SampleClips/jfk-medium-1650.txt +++ b/SampleClips/jfk-medium-1650.txt @@ -1,43 +1,44 @@ CPU Tasks -LoadModel 818.309 milliseconds -RunComplete 4.59853 seconds -Run 4.51124 seconds -Callbacks 259.1 microseconds, 4 calls, 64.775 microseconds average -Spectrogram 62.0087 milliseconds, 3 calls, 20.6696 milliseconds average -Sample 3.3139 milliseconds, 28 calls, 118.354 microseconds average -Encode 3.54162 seconds -Decode 969.342 milliseconds -DecodeStep 966.005 milliseconds, 28 calls, 34.5002 milliseconds average +LoadModel 2.20693 seconds +RunComplete 3.16174 seconds +Run 3.07912 seconds +Callbacks 387.3 microseconds, 4 calls, 96.825 microseconds average +Spectrogram 16.201 milliseconds, 3 calls, 5.40033 milliseconds average +Sample 3.3725 milliseconds, 28 calls, 120.446 microseconds average +Encode 2.07037 seconds +Decode 1.00834 seconds +DecodeStep 1.00495 seconds, 28 calls, 35.8911 milliseconds average GPU Tasks -LoadModel 623.002 milliseconds -Run 4.38954 seconds -Encode 3.46286 seconds -EncodeLayer 2.86548 seconds, 24 calls, 119.395 milliseconds average -Decode 926.677 milliseconds -DecodeStep 926.674 milliseconds, 28 calls, 33.0955 milliseconds average -DecodeLayer 843.963 milliseconds, 672 calls, 1.2559 milliseconds average +LoadModel 1.81217 seconds +Run 2.94117 seconds +Encode 1.95373 seconds +EncodeLayer 1.56747 seconds, 24 calls, 65.3115 milliseconds average +Decode 987.441 milliseconds +DecodeStep 987.44 milliseconds, 28 calls, 35.2657 milliseconds average +DecodeLayer 915.401 milliseconds, 672 calls, 1.3622 milliseconds average Compute Shaders -mulMatTiled 3.19154 seconds, 529 calls, 6.03316 milliseconds average -mulMatByRowTiled 628.359 milliseconds, 7803 calls, 80.5278 microseconds average -convolutionMain2Fixed 98.3757 milliseconds -convolutionMain 95.2955 milliseconds -softMaxFixed 73.4031 milliseconds, 696 calls, 105.464 microseconds average -addRepeat 58.0541 milliseconds, 2808 calls, 20.6745 microseconds average -copyConvert 42.8539 milliseconds, 1440 calls, 29.7597 microseconds average -softMax 37.7754 milliseconds, 700 calls, 53.9649 microseconds average -normFixed 25.4389 milliseconds, 2093 calls, 12.1543 microseconds average -fmaRepeat1 24.6287 milliseconds, 2093 calls, 11.7672 microseconds average -addRepeatGelu 24.2553 milliseconds, 698 calls, 34.7497 microseconds average -copyTranspose 24.2415 milliseconds, 1392 calls, 17.4149 microseconds average -addInPlace 20.4598 milliseconds, 1392 calls, 14.6981 microseconds average -scaleInPlace 12.8947 milliseconds, 696 calls, 18.5269 microseconds average -addRepeatScale 10.8749 milliseconds, 1344 calls, 8.09144 microseconds average -add 7.3752 milliseconds, 673 calls, 10.9587 microseconds average -convolutionPrep1 6.0929 milliseconds, 2 calls, 3.04645 milliseconds average -diagMaskInf 3.2818 milliseconds, 672 calls, 4.88363 microseconds average -convolutionPrep2 1.2268 milliseconds, 2 calls, 613.4 microseconds average -addRows 165.9 microseconds, 28 calls, 5.925 microseconds average +mulMatTiled 1.68817 seconds, 529 calls, 3.19125 milliseconds average +mulMatByRowTiled 562.722 milliseconds, 7803 calls, 72.1161 microseconds average +convolutionMain2Fixed 99.873 milliseconds +softMaxFixed 84.045 milliseconds, 696 calls, 120.754 microseconds average +copyTranspose 80.3619 milliseconds, 1392 calls, 57.7313 microseconds average +fmaRepeat1 71.9629 milliseconds, 2093 calls, 34.3827 microseconds average +convolutionMain 60.588 milliseconds +addRepeatScale 53.2349 milliseconds, 1344 calls, 39.6093 microseconds average +normFixed 34.7651 milliseconds, 2093 calls, 16.6102 microseconds average +addRepeatEx 31.9206 milliseconds, 2064 calls, 15.4654 microseconds average +copyConvert 30.7856 milliseconds, 1440 calls, 21.3789 microseconds average +addRepeatGelu 25.5167 milliseconds, 698 calls, 36.5569 microseconds average +softMaxLong 25.3214 milliseconds, 28 calls, 904.336 microseconds average +scaleInPlace 24.1527 milliseconds, 696 calls, 34.7022 microseconds average +softMax 21.0692 milliseconds, 672 calls, 31.353 microseconds average +addRepeat 19.8584 milliseconds, 744 calls, 26.6914 microseconds average +diagMaskInf 12.5615 milliseconds, 672 calls, 18.6927 microseconds average +convolutionPrep1 6.113 milliseconds, 2 calls, 3.0565 milliseconds average +convolutionPrep2 1.2294 milliseconds, 2 calls, 614.7 microseconds average +add 532.9 microseconds +addRows 178.9 microseconds, 28 calls, 6.38929 microseconds average Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 1.98347 MB RAM, 723.729 MB VRAM -Total 2.84085 MB RAM, 2.13462 GB VRAM +Context 1.9831 MB RAM, 723.782 MB VRAM +Total 2.84049 MB RAM, 2.13467 GB VRAM diff --git a/SampleClips/jfk-medium-vega7.txt b/SampleClips/jfk-medium-vega7.txt index 0be45d3..7753030 100644 --- a/SampleClips/jfk-medium-vega7.txt +++ b/SampleClips/jfk-medium-vega7.txt @@ -1,46 +1,47 @@ CPU Tasks -LoadModel 1.44983 seconds -RunComplete 9.9723 seconds -Run 9.8953 seconds -Callbacks 876.5 microseconds, 4 calls, 219.125 microseconds average -Spectrogram 100.602 milliseconds, 3 calls, 33.5339 milliseconds average -Sample 8.2281 milliseconds, 28 calls, 293.861 microseconds average -Encode 8.28685 seconds -Decode 1.60728 seconds -DecodeStep 1.599 seconds, 28 calls, 57.1073 milliseconds average +LoadModel 1.79203 seconds +RunComplete 8.79853 seconds +Run 8.71884 seconds +Callbacks 626.8 microseconds, 4 calls, 156.7 microseconds average +Spectrogram 17.3373 milliseconds, 3 calls, 5.7791 milliseconds average +Sample 5.449 milliseconds, 28 calls, 194.607 microseconds average +Encode 7.29966 seconds +Decode 1.41824 seconds +DecodeStep 1.41276 seconds, 28 calls, 50.4557 milliseconds average GPU Tasks -LoadModel 751.497 milliseconds -Run 9.73531 seconds -Encode 8.28303 seconds -EncodeLayer 7.19651 seconds, 24 calls, 299.855 milliseconds average -Decode 1.45228 seconds -DecodeStep 1.45225 seconds, 28 calls, 51.866 milliseconds average -DecodeLayer 1.31372 seconds, 672 calls, 1.95494 milliseconds average +LoadModel 930.123 milliseconds +Run 8.64946 seconds +Encode 7.34021 seconds +EncodeLayer 6.40759 seconds, 24 calls, 266.983 milliseconds average +Decode 1.30925 seconds +DecodeStep 1.30389 seconds, 28 calls, 46.5676 milliseconds average +DecodeLayer 1.19422 seconds, 672 calls, 1.77711 milliseconds average Compute Shaders -mulMatTiledEx 5.73474 seconds, 240 calls, 23.8947 milliseconds average -mulMatTiled 1.59442 seconds, 289 calls, 5.51703 milliseconds average -mulMatByRowTiled 708.039 milliseconds, 6507 calls, 108.812 microseconds average -mulMatByRowTiledEx 292.797 milliseconds, 1296 calls, 225.923 microseconds average -convolutionMain2Fixed 267.762 milliseconds -softMaxFixed 252.702 milliseconds, 696 calls, 363.078 microseconds average -addRepeat 122.774 milliseconds, 2808 calls, 43.7229 microseconds average -matReshapePanels 116.085 milliseconds, 145 calls, 800.583 microseconds average -convolutionMain 100.111 milliseconds -addRepeatGelu 78.6895 milliseconds, 698 calls, 112.736 microseconds average -normFixed 64.6521 milliseconds, 2093 calls, 30.8897 microseconds average -scaleInPlace 64.0629 milliseconds, 696 calls, 92.0444 microseconds average -copyConvert 62.7305 milliseconds, 1440 calls, 43.5628 microseconds average -softMax 50.9006 milliseconds, 700 calls, 72.7151 microseconds average -fmaRepeat1 49.6347 milliseconds, 2093 calls, 23.7146 microseconds average -copyTranspose 44.2248 milliseconds, 1392 calls, 31.7707 microseconds average -addInPlace 44.1766 milliseconds, 1392 calls, 31.7361 microseconds average -addRepeatScale 31.3737 milliseconds, 1344 calls, 23.3435 microseconds average -add 19.0564 milliseconds, 673 calls, 28.3156 microseconds average -convolutionPrep1 8.494 milliseconds, 2 calls, 4.247 milliseconds average -diagMaskInf 6.9839 milliseconds, 672 calls, 10.3927 microseconds average -convolutionPrep2 6.0876 milliseconds, 2 calls, 3.0438 milliseconds average -addRows 72 microseconds, 28 calls, 2.57143 microseconds average +mulMatTiledEx 4.91773 seconds, 240 calls, 20.4906 milliseconds average +mulMatTiled 1.47531 seconds, 289 calls, 5.10489 milliseconds average +mulMatByRowTiled 627.1 milliseconds, 6507 calls, 96.3731 microseconds average +softMaxFixed 268.285 milliseconds, 696 calls, 385.467 microseconds average +convolutionMain2Fixed 266.261 milliseconds +mulMatByRowTiledEx 241.609 milliseconds, 1296 calls, 186.427 microseconds average +matReshapePanels 156.683 milliseconds, 145 calls, 1.08057 milliseconds average +convolutionMain 102.091 milliseconds +copyConvert 77.6113 milliseconds, 1440 calls, 53.8967 microseconds average +addRepeatGelu 71.5118 milliseconds, 698 calls, 102.452 microseconds average +copyTranspose 63.3929 milliseconds, 1392 calls, 45.5409 microseconds average +normFixed 60.9615 milliseconds, 2093 calls, 29.1264 microseconds average +scaleInPlace 59.9341 milliseconds, 696 calls, 86.1122 microseconds average +fmaRepeat1 56.3539 milliseconds, 2093 calls, 26.9249 microseconds average +addRepeatEx 51.8785 milliseconds, 2064 calls, 25.1349 microseconds average +addRepeat 48.1192 milliseconds, 744 calls, 64.6763 microseconds average +softMaxLong 28.3411 milliseconds, 28 calls, 1.01218 milliseconds average +addRepeatScale 21.3646 milliseconds, 1344 calls, 15.8963 microseconds average +softMax 10.198 milliseconds, 672 calls, 15.1756 microseconds average +convolutionPrep1 9.1072 milliseconds, 2 calls, 4.5536 milliseconds average +diagMaskInf 8.3764 milliseconds, 672 calls, 12.4649 microseconds average +convolutionPrep2 7.4623 milliseconds, 2 calls, 3.73115 milliseconds average +add 2.3886 milliseconds +addRows 97.6 microseconds, 28 calls, 3.48571 microseconds average Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 1.9836 MB RAM, 771.354 MB VRAM -Total 2.84099 MB RAM, 2.18113 GB VRAM +Context 1.9831 MB RAM, 771.235 MB VRAM +Total 2.84049 MB RAM, 2.18101 GB VRAM diff --git a/SampleClips/jfk-medium-vega8.txt b/SampleClips/jfk-medium-vega8.txt index e6c4ec6..1e2344c 100644 --- a/SampleClips/jfk-medium-vega8.txt +++ b/SampleClips/jfk-medium-vega8.txt @@ -1,46 +1,47 @@  CPU Tasks -LoadModel 822.903 milliseconds -RunComplete 6.09091 seconds -Run 6.03901 seconds -Callbacks 526 microseconds, 4 calls, 131.5 microseconds average -Spectrogram 48.1091 milliseconds, 3 calls, 16.0364 milliseconds average -Sample 4.1241 milliseconds, 28 calls, 147.289 microseconds average -Encode 4.71938 seconds -Decode 1.31885 seconds -DecodeStep 1.31471 seconds, 28 calls, 46.954 milliseconds average +LoadModel 827.449 milliseconds +RunComplete 4.95485 seconds +Run 4.90459 seconds +Callbacks 343.6 microseconds, 4 calls, 85.9 microseconds average +Spectrogram 12.0208 milliseconds, 3 calls, 4.00693 milliseconds average +Sample 3.798 milliseconds, 28 calls, 135.643 microseconds average +Encode 3.78211 seconds +Decode 1.12187 seconds +DecodeStep 1.11805 seconds, 28 calls, 39.9304 milliseconds average GPU Tasks -LoadModel 443.09 milliseconds -Run 5.96203 seconds -Encode 4.76175 seconds -EncodeLayer 4.06066 seconds, 24 calls, 169.194 milliseconds average -Decode 1.20028 seconds -DecodeStep 1.20025 seconds, 28 calls, 42.8662 milliseconds average -DecodeLayer 1.08776 seconds, 672 calls, 1.61869 milliseconds average +LoadModel 429.525 milliseconds +Run 4.86319 seconds +Encode 3.82894 seconds +EncodeLayer 3.23846 seconds, 24 calls, 134.936 milliseconds average +Decode 1.03424 seconds +DecodeStep 1.03083 seconds, 28 calls, 36.8153 milliseconds average +DecodeLayer 934.97 milliseconds, 672 calls, 1.39132 milliseconds average Compute Shaders -mulMatTiledEx 3.20617 seconds, 240 calls, 13.3591 milliseconds average -mulMatTiled 997.08 milliseconds, 289 calls, 3.4501 milliseconds average -mulMatByRowTiled 584.778 milliseconds, 6507 calls, 89.869 microseconds average -mulMatByRowTiledEx 264.182 milliseconds, 1296 calls, 203.844 microseconds average -softMaxFixed 177.044 milliseconds, 696 calls, 254.373 microseconds average -convolutionMain2Fixed 132.808 milliseconds -addRepeat 94.4794 milliseconds, 2808 calls, 33.6465 microseconds average -matReshapePanels 78.392 milliseconds, 145 calls, 540.634 microseconds average -copyConvert 56.4795 milliseconds, 1440 calls, 39.2219 microseconds average -addRepeatGelu 54.2227 milliseconds, 698 calls, 77.683 microseconds average -scaleInPlace 44.2715 milliseconds, 696 calls, 63.6085 microseconds average -normFixed 40.1478 milliseconds, 2093 calls, 19.1819 microseconds average -fmaRepeat1 40.1179 milliseconds, 2093 calls, 19.1677 microseconds average -addInPlace 37.9729 milliseconds, 1392 calls, 27.2794 microseconds average -softMax 36.5724 milliseconds, 700 calls, 52.2463 microseconds average -convolutionMain 33.6316 milliseconds -copyTranspose 27.8482 milliseconds, 1392 calls, 20.0059 microseconds average -addRepeatScale 18.8881 milliseconds, 1344 calls, 14.0536 microseconds average -add 12.8944 milliseconds, 673 calls, 19.1596 microseconds average -diagMaskInf 8.9251 milliseconds, 672 calls, 13.2814 microseconds average -convolutionPrep2 3.653 milliseconds, 2 calls, 1.8265 milliseconds average -convolutionPrep1 3.5057 milliseconds, 2 calls, 1.75285 milliseconds average -addRows 64 microseconds, 28 calls, 2.28571 microseconds average +mulMatTiledEx 2.59201 seconds, 240 calls, 10.8 milliseconds average +mulMatTiled 733.345 milliseconds, 289 calls, 2.53753 milliseconds average +mulMatByRowTiled 492.505 milliseconds, 6507 calls, 75.6884 microseconds average +mulMatByRowTiledEx 226.315 milliseconds, 1296 calls, 174.626 microseconds average +softMaxFixed 169.603 milliseconds, 696 calls, 243.683 microseconds average +convolutionMain2Fixed 130.131 milliseconds +matReshapePanels 85.7723 milliseconds, 145 calls, 591.533 microseconds average +addRepeatGelu 52.8833 milliseconds, 698 calls, 75.764 microseconds average +copyConvert 49.8477 milliseconds, 1440 calls, 34.6165 microseconds average +scaleInPlace 47.7803 milliseconds, 696 calls, 68.6499 microseconds average +normFixed 44.0434 milliseconds, 2093 calls, 21.0432 microseconds average +fmaRepeat1 38.6945 milliseconds, 2093 calls, 18.4876 microseconds average +copyTranspose 36.6512 milliseconds, 1392 calls, 26.3299 microseconds average +addRepeatEx 33.6887 milliseconds, 2064 calls, 16.322 microseconds average +addRepeat 32.9016 milliseconds, 744 calls, 44.2226 microseconds average +convolutionMain 32.8426 milliseconds +softMaxLong 19.8753 milliseconds, 28 calls, 709.832 microseconds average +addRepeatScale 15.8724 milliseconds, 1344 calls, 11.8098 microseconds average +softMax 5.5277 milliseconds, 672 calls, 8.22574 microseconds average +diagMaskInf 5.1549 milliseconds, 672 calls, 7.67098 microseconds average +convolutionPrep2 3.9464 milliseconds, 2 calls, 1.9732 milliseconds average +convolutionPrep1 3.4569 milliseconds, 2 calls, 1.72845 milliseconds average +add 722.7 microseconds +addRows 80 microseconds, 28 calls, 2.85714 microseconds average Memory Usage Model 877.966 KB RAM, 1.42785 GB VRAM -Context 1.9836 MB RAM, 771.354 MB VRAM -Total 2.84099 MB RAM, 2.18113 GB VRAM +Context 1.9831 MB RAM, 771.235 MB VRAM +Total 2.84049 MB RAM, 2.18101 GB VRAM