Skip to content

Commit ef82fe7

Browse files
Merge pull request #1893 from CliMA/ck/fix_benchmark_script
Fix and update benchmark script
2 parents e7b2c9b + f940c2c commit ef82fe7

File tree

1 file changed

+101
-34
lines changed

1 file changed

+101
-34
lines changed

benchmarks/scripts/linear_vs_cartesian_indexing.jl renamed to benchmarks/scripts/indexing_and_static_ndranges.jl

Lines changed: 101 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,12 @@
11
#=
22
julia --project=.buildkite
3-
using Revise; include(joinpath("benchmarks", "scripts", "linear_vs_cartesian_indexing.jl"))
3+
using Revise; include(joinpath("benchmarks", "scripts", "indexing_and_static_ndranges.jl"))
44
55
# Info:
6+
This script compares two things:
7+
- linear vs cartesian indexing
8+
- impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/)
9+
610
Linear indexing, when possible, has performance advantages
711
over using Cartesian indexing. Julia Base's Broadcast only
812
supports Cartesian indexing as it provides more general support
@@ -13,6 +17,18 @@ This script (re-)defines some broadcast machinery and tests
1317
the performance of vector vs array operations in a broadcast
1418
setting where linear indexing is allowed.
1519
20+
# Summary:
21+
- On the CPU:
22+
static NDRanges do not play an important role,
23+
but linear indexing is 2x faster than cartesian
24+
indexing.
25+
- On the GPU:
26+
static NDRanges DO play an important role,
27+
but we could (alternatively) see an improvement
28+
by using linear indexing. Supporting StaticNDRanges
29+
also impacts non-pointwise kernels, and yields
30+
nearly the same benefit as linear indexing.
31+
1632
# References:
1733
- https://github.com/CliMA/ClimaCore.jl/issues/1889
1834
- https://github.com/JuliaLang/julia/issues/28126
@@ -23,27 +39,43 @@ setting where linear indexing is allowed.
2339
Local Apple M1 Mac (CPU):
2440
```
2541
at_dot_call!($X_array, $Y_array):
26-
146 milliseconds, 558 microseconds
42+
143 milliseconds, 774 microseconds
2743
at_dot_call!($X_vector, $Y_vector):
28-
65 milliseconds, 531 microseconds
29-
custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
30-
66 milliseconds, 735 microseconds
31-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
32-
145 milliseconds, 957 microseconds
33-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
34-
66 milliseconds, 320 microseconds
44+
65 milliseconds, 567 microseconds
45+
custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
46+
66 milliseconds, 870 microseconds
47+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
48+
143 milliseconds, 643 microseconds
49+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
50+
65 milliseconds, 778 microseconds
51+
custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
52+
65 milliseconds, 765 microseconds
53+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
54+
144 milliseconds, 271 microseconds
55+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
56+
66 milliseconds, 376 microseconds
3557
```
3658
3759
Clima A100
3860
```
61+
at_dot_call!($X_array, $Y_array):
62+
6 milliseconds, 775 microseconds
3963
at_dot_call!($X_vector, $Y_vector):
40-
2 milliseconds, 848 microseconds
41-
custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
42-
2 milliseconds, 537 microseconds
43-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
44-
8 milliseconds, 804 microseconds
45-
custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
46-
2 milliseconds, 545 microseconds
64+
2 milliseconds, 834 microseconds
65+
custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))):
66+
2 milliseconds, 547 microseconds
67+
custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
68+
2 milliseconds, 561 microseconds
69+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
70+
4 milliseconds, 160 microseconds
71+
custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
72+
2 milliseconds, 584 microseconds
73+
custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
74+
2 milliseconds, 540 microseconds
75+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
76+
2 milliseconds, 715 microseconds
77+
custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
78+
2 milliseconds, 547 microseconds
4779
```
4880
=#
4981

@@ -239,7 +271,7 @@ function at_dot_call!(X, Y)
239271
return nothing
240272
end;
241273

242-
function custom_kernel!(X, Y, ::Val{N}) where {N}
274+
function custom_sol_kernel!(X, Y, ::Val{N}) where {N}
243275
(; x1, x2, x3) = X
244276
(; y1) = Y
245277
kernel = CUDA.@cuda always_inline = true launch = false custom_kernel_knl!(
@@ -267,7 +299,27 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
267299
return nothing
268300
end;
269301

270-
function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
302+
abstract type AbstractUniversalSizes{Nv, Nij} end
303+
struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij}
304+
Nh::Int
305+
end
306+
struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end
307+
308+
get_Nv(::AbstractUniversalSizes{Nv}) where {Nv} = Nv
309+
get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = Nij
310+
get_Nh(us::UniversalSizesCC) = us.Nh
311+
get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh} = Nh
312+
get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij} = prod((Nv,Nij,Nij,1,get_Nh(us)))
313+
UniversalSizesCC(Nv, Nij, Nh) = UniversalSizesCC{Nv, Nij}(Nh)
314+
UniversalSizesStatic(Nv, Nij, Nh) = UniversalSizesStatic{Nv, Nij, Nh}()
315+
using Test
316+
us_tup = (1, 2, 3)
317+
@test get_Nv(UniversalSizesCC(us_tup...)) == get_Nv(UniversalSizesStatic(us_tup...))
318+
@test get_Nij(UniversalSizesCC(us_tup...)) == get_Nij(UniversalSizesStatic(us_tup...))
319+
@test get_Nh(UniversalSizesCC(us_tup...)) == get_Nh(UniversalSizesStatic(us_tup...))
320+
@test get_N(UniversalSizesCC(us_tup...)) == get_N(UniversalSizesStatic(us_tup...))
321+
322+
function custom_kernel_bc!(X, Y, us::AbstractUniversalSizes; printtb=true, use_pw=true)
271323
(; x1, x2, x3) = X
272324
(; y1) = Y
273325
bc_base = @lazy @. y1 = myadd(x1, x2, x3)
@@ -281,7 +333,7 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
281333
end
282334
else
283335
for i in 1:100 # reduce variance / impact of launch latency
284-
@inbounds @simd for j in 1:N
336+
@inbounds @simd for j in 1:get_N(us)
285337
y1[j] = bc[j]
286338
end
287339
end
@@ -291,28 +343,28 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
291343
CUDA.@cuda always_inline = true launch = false custom_kernel_knl_bc!(
292344
y1,
293345
bc,
294-
Val(N),
346+
us,
295347
)
296348
config = CUDA.launch_configuration(kernel.fun)
297349
threads = min(N, config.threads)
298350
blocks = cld(N, threads)
299351
printtb && @show blocks, threads
300352
for i in 1:100 # reduce variance / impact of launch latency
301-
kernel(y1, bc, Val(N); threads, blocks)
353+
kernel(y1, bc,us; threads, blocks)
302354
end
303355
end
304356
return nothing
305357
end;
306358
@inline get_cart_lin_index(bc, n, I) = I
307359
@inline get_cart_lin_index(bc::Base.Broadcast.Broadcasted, n, I) =
308360
CartesianIndices(map(x -> Base.OneTo(x), n))[I]
309-
function custom_kernel_knl_bc!(y1, bc, ::Val{N}) where {N}
361+
function custom_kernel_knl_bc!(y1, bc, us)
310362
@inbounds begin
311363
I = (CUDA.blockIdx().x - Int32(1)) * CUDA.blockDim().x + CUDA.threadIdx().x
312-
n = size(y1)
313-
if 1 ≤ I ≤ N
314-
ind = get_cart_lin_index(bc, n, I)
315-
y1[ind] = bc[ind]
364+
if 1 ≤ I ≤ get_N(us)
365+
n = (get_Nv(us), get_Nij(us), get_Nij(us), 1, get_Nh(us))
366+
ci = get_cart_lin_index(bc, n, I)
367+
y1[ci] = bc[ci]
316368
end
317369
end
318370
return nothing
@@ -327,16 +379,31 @@ X_vector = to_vec(X_array);
327379
Y_vector = to_vec(Y_array);
328380
at_dot_call!(X_array, Y_array)
329381
at_dot_call!(X_vector, Y_vector)
330-
# custom_kernel!(X_vector, Y_vector, Val(length(X_vector.x1)))
331-
custom_kernel_bc!(X_vector, Y_vector, Val(length(X_vector.x1)))
332-
custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=false)
333-
custom_kernel_bc!(X_array, Y_array, Val(length(X_vector.x1)); use_pw=true)
382+
N = length(X_vector.x1)
383+
(Nv, Nij, _, Nf, Nh) = size(Y_array.y1);
384+
us = UniversalSizesCC(Nv, Nij, Nh);
385+
uss = UniversalSizesStatic(Nv, Nij, Nh);
386+
@test get_N(us) == N
387+
@test get_N(uss) == N
388+
iscpu = ArrayType === identity
389+
iscpu || custom_sol_kernel!(X_vector, Y_vector, Val(N))
390+
custom_kernel_bc!(X_vector, Y_vector, us)
391+
custom_kernel_bc!(X_array, Y_array, us; use_pw=false)
392+
custom_kernel_bc!(X_array, Y_array, us; use_pw=true)
393+
394+
custom_kernel_bc!(X_vector, Y_vector, uss)
395+
custom_kernel_bc!(X_array, Y_array, uss; use_pw=false)
396+
custom_kernel_bc!(X_array, Y_array, uss; use_pw=true)
334397

335398
@pretty_belapsed at_dot_call!($X_array, $Y_array) # slow
336399
@pretty_belapsed at_dot_call!($X_vector, $Y_vector) # fast
337-
# @pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(length(X_vector.x1))))
338-
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1)));printtb=false)
339-
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=false)
340-
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1)));printtb=false, use_pw=true)
400+
iscpu || @pretty_belapsed custom_sol_kernel!($X_vector, $Y_vector, $(Val(N)))
401+
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $us; printtb=false)
402+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; printtb=false, use_pw=false)
403+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $us; printtb=false, use_pw=true)
404+
405+
@pretty_belapsed custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb=false)
406+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; printtb=false, use_pw=false)
407+
@pretty_belapsed custom_kernel_bc!($X_array, $Y_array, $uss; printtb=false, use_pw=true)
341408

342409
#! format: on

0 commit comments

Comments
 (0)