diff --git a/perf/results/a100_cuda.txt b/perf/results/a100_cuda.txt new file mode 100644 index 0000000..8135704 --- /dev/null +++ b/perf/results/a100_cuda.txt @@ -0,0 +1,27 @@ +2023-10-12T17:08:07-04:00 +Running build-cuda/perf/benchmarks +Run on (48 X 3614.93 MHz CPU s) +CPU Caches: + L1 Data 32 KiB (x48) + L1 Instruction 32 KiB (x48) + L2 Unified 512 KiB (x48) + L3 Unified 32768 KiB (x8) +Load Average: 6.14, 6.45, 6.85 +GPU: + NVIDIA A100-SXM4-40GB + L2 Cache: 40960 KiB + Number of SMs: x108 + Peak Memory Bandwidth: 1555 (GB/s) +------------------------------------------------------------------------------------------------------ +Benchmark Time CPU Iterations BW (GB/s) +------------------------------------------------------------------------------------------------------ +p2rng_generate_cuda/1048576/manual_time 0.056 ms 0.063 ms 10894 74.3961/s +p2rng_generate_cuda/2097152/manual_time 0.107 ms 0.114 ms 6528 78.2329/s +p2rng_generate_cuda/4194304/manual_time 0.218 ms 0.225 ms 3208 76.9102/s +p2rng_generate_cuda/8388608/manual_time 0.450 ms 0.456 ms 1557 74.6077/s +p2rng_generate_cuda/16777216/manual_time 0.931 ms 0.936 ms 752 72.1066/s +p2rng_generate_cuda/1048576/manual_time 0.054 ms 0.061 ms 12995 155.56/s +p2rng_generate_cuda/2097152/manual_time 0.107 ms 0.114 ms 6556 157.118/s +p2rng_generate_cuda/4194304/manual_time 0.217 ms 0.224 ms 3222 154.383/s +p2rng_generate_cuda/8388608/manual_time 0.448 ms 0.454 ms 1564 149.915/s +p2rng_generate_cuda/16777216/manual_time 0.927 ms 0.932 ms 755 144.804/s \ No newline at end of file diff --git a/perf/results/epic7543_openmp.txt b/perf/results/epic7543_openmp.txt index b5fecef..a6833dd 100644 --- a/perf/results/epic7543_openmp.txt +++ b/perf/results/epic7543_openmp.txt @@ -1,4 +1,4 @@ -2023-10-09T10:19:04-04:00 +2023-10-12T16:49:40-04:00 Running build/perf/benchmarks Run on (128 X 2794.65 MHz CPU s) CPU Caches: @@ -6,27 +6,27 @@ CPU Caches: L1 Instruction 32 KiB (x64) L2 Unified 512 KiB (x64) L3 Unified 32768 KiB (x16) -Load Average: 0.01, 0.09, 0.09 +Load Average: 0.07, 0.04, 0.05 ------------------------------------------------------------------------------------------------------ Benchmark Time CPU Iterations BW (GB/s) ------------------------------------------------------------------------------------------------------ -generate_stl/1048576 1.13 ms 1.13 ms 619 3.70477/s -generate_stl/2097152 2.26 ms 2.26 ms 309 3.70545/s -generate_stl/4194304 4.53 ms 4.53 ms 154 3.70497/s -generate_stl/8388608 9.07 ms 9.07 ms 76 3.70131/s -generate_stl/16777216 18.2 ms 18.2 ms 39 3.6901/s -generate_stl/1048576 1.13 ms 1.13 ms 618 7.41022/s -generate_stl/2097152 2.26 ms 2.26 ms 309 7.41036/s -generate_stl/4194304 4.55 ms 4.55 ms 154 7.37824/s -generate_stl/8388608 9.16 ms 9.16 ms 76 7.32829/s -generate_stl/16777216 18.4 ms 18.4 ms 38 7.31129/s -generate_p2rng_openmp/1048576/real_time 0.059 ms 0.059 ms 11925 71.6939/s -generate_p2rng_openmp/2097152/real_time 0.072 ms 0.072 ms 9691 117.179/s -generate_p2rng_openmp/4194304/real_time 0.105 ms 0.105 ms 6439 159.14/s -generate_p2rng_openmp/8388608/real_time 0.199 ms 0.199 ms 3643 168.367/s -generate_p2rng_openmp/16777216/real_time 0.341 ms 0.341 ms 2043 196.934/s -generate_p2rng_openmp/1048576/real_time 0.058 ms 0.058 ms 11740 143.954/s -generate_p2rng_openmp/2097152/real_time 0.082 ms 0.082 ms 8755 205.348/s -generate_p2rng_openmp/4194304/real_time 0.129 ms 0.129 ms 6061 259.149/s -generate_p2rng_openmp/8388608/real_time 0.255 ms 0.254 ms 3503 262.861/s -generate_p2rng_openmp/16777216/real_time 0.360 ms 0.360 ms 1946 372.631/s +stl_generate/1048576 1.13 ms 1.13 ms 618 3.70608/s +stl_generate/2097152 2.26 ms 2.26 ms 309 3.70611/s +stl_generate/4194304 4.53 ms 4.53 ms 155 3.70117/s +stl_generate/8388608 9.06 ms 9.06 ms 77 3.7053/s +stl_generate/16777216 18.2 ms 18.2 ms 38 3.69253/s +stl_generate/1048576 1.16 ms 1.16 ms 601 7.21236/s +stl_generate/2097152 2.33 ms 2.33 ms 301 7.21276/s +stl_generate/4194304 4.66 ms 4.66 ms 150 7.1974/s +stl_generate/8388608 9.40 ms 9.40 ms 74 7.1364/s +stl_generate/16777216 18.8 ms 18.8 ms 37 7.12223/s +p2rng_generate_openmp/1048576/real_time 0.060 ms 0.060 ms 11738 70.2157/s +p2rng_generate_openmp/2097152/real_time 0.074 ms 0.074 ms 9387 112.829/s +p2rng_generate_openmp/4194304/real_time 0.110 ms 0.110 ms 6364 152.605/s +p2rng_generate_openmp/8388608/real_time 0.192 ms 0.192 ms 3797 174.927/s +p2rng_generate_openmp/16777216/real_time 0.335 ms 0.335 ms 2090 200.584/s +p2rng_generate_openmp/1048576/real_time 0.061 ms 0.061 ms 11233 136.619/s +p2rng_generate_openmp/2097152/real_time 0.081 ms 0.081 ms 8426 207.024/s +p2rng_generate_openmp/4194304/real_time 0.136 ms 0.136 ms 5922 247.07/s +p2rng_generate_openmp/8388608/real_time 0.214 ms 0.214 ms 3307 313.398/s +p2rng_generate_openmp/16777216/real_time 0.386 ms 0.386 ms 1819 347.603/s \ No newline at end of file diff --git a/perf/results/mi210_rocm.txt b/perf/results/mi210_rocm.txt index d3559d8..64bdf4f 100644 --- a/perf/results/mi210_rocm.txt +++ b/perf/results/mi210_rocm.txt @@ -1,12 +1,11 @@ -2023-10-09T10:14:41-04:00 -Running perf/benchmarks +unning build-rocm/perf/benchmarks Run on (128 X 2794.65 MHz CPU s) CPU Caches: L1 Data 32 KiB (x64) L1 Instruction 32 KiB (x64) L2 Unified 512 KiB (x64) L3 Unified 32768 KiB (x16) -Load Average: 0.27, 0.17, 0.10 +Load Average: 4.98, 5.50, 3.01 GPU: AMD Instinct MI210 L2 Cache: 8192 KiB @@ -15,13 +14,13 @@ GPU: ------------------------------------------------------------------------------------------------------ Benchmark Time CPU Iterations BW (GB/s) ------------------------------------------------------------------------------------------------------ -generate_p2rng_rocm/1048576/manual_time 0.057 ms 0.073 ms 12343 73.9729/s -generate_p2rng_rocm/2097152/manual_time 0.097 ms 0.116 ms 7247 86.8508/s -generate_p2rng_rocm/4194304/manual_time 0.180 ms 0.198 ms 3894 93.3282/s -generate_p2rng_rocm/8388608/manual_time 0.352 ms 0.370 ms 1991 95.4046/s -generate_p2rng_rocm/16777216/manual_time 0.709 ms 0.727 ms 987 94.6272/s -generate_p2rng_rocm/1048576/manual_time 0.058 ms 0.077 ms 12143 145.125/s -generate_p2rng_rocm/2097152/manual_time 0.098 ms 0.117 ms 7087 170.892/s -generate_p2rng_rocm/4194304/manual_time 0.181 ms 0.199 ms 3865 185.699/s -generate_p2rng_rocm/8388608/manual_time 0.354 ms 0.373 ms 1971 189.321/s -generate_p2rng_rocm/16777216/manual_time 0.715 ms 0.732 ms 979 187.772/s \ No newline at end of file +p2rng_generate_rocm/1048576/manual_time 0.057 ms 0.078 ms 12358 73.7611/s +p2rng_generate_rocm/2097152/manual_time 0.097 ms 0.117 ms 7221 86.5602/s +p2rng_generate_rocm/4194304/manual_time 0.180 ms 0.197 ms 3886 93.2582/s +p2rng_generate_rocm/8388608/manual_time 0.352 ms 0.370 ms 1990 95.3958/s +p2rng_generate_rocm/16777216/manual_time 0.710 ms 0.727 ms 987 94.5727/s +p2rng_generate_rocm/1048576/manual_time 0.058 ms 0.076 ms 12129 145.378/s +p2rng_generate_rocm/2097152/manual_time 0.098 ms 0.117 ms 7122 170.609/s +p2rng_generate_rocm/4194304/manual_time 0.181 ms 0.200 ms 3876 185.695/s +p2rng_generate_rocm/8388608/manual_time 0.355 ms 0.373 ms 1973 189.193/s +p2rng_generate_rocm/16777216/manual_time 0.715 ms 0.734 ms 979 187.703/s