1
1
#=
2
2
julia --project=.buildkite
3
- using Revise; include(joinpath("benchmarks", "scripts", "linear_vs_cartesian_indexing.jl"))
3
+ using Revise; include(joinpath("benchmarks", "scripts", "indexing_and_static_ndranges.jl"))
4
4
5
5
# Info:
6
+ This script compares two things:
7
+ - linear vs cartesian indexing
8
+ - impact of static vs dynamic NDRanges (https://juliagpu.github.io/KernelAbstractions.jl/dev/examples/memcopy_static/)
9
+
6
10
Linear indexing, when possible, has performance advantages
7
11
over using Cartesian indexing. Julia Base's Broadcast only
8
12
supports Cartesian indexing as it provides more general support
@@ -13,6 +17,18 @@ This script (re-)defines some broadcast machinery and tests
13
17
the performance of vector vs array operations in a broadcast
14
18
setting where linear indexing is allowed.
15
19
20
+ # Summary:
21
+ - On the CPU:
22
+ static NDRanges do not play an important role,
23
+ but linear indexing is 2x faster than cartesian
24
+ indexing.
25
+ - On the GPU:
26
+ static NDRanges DO play an important role,
27
+ but we could (alternatively) see an improvement
28
+ by using linear indexing. Supporting StaticNDRanges
29
+ also impacts non-pointwise kernels, and yields
30
+ nearly the same benefit as linear indexing.
31
+
16
32
# References:
17
33
- https://github.com/CliMA/ClimaCore.jl/issues/1889
18
34
- https://github.com/JuliaLang/julia/issues/28126
@@ -23,27 +39,43 @@ setting where linear indexing is allowed.
23
39
Local Apple M1 Mac (CPU):
24
40
```
25
41
at_dot_call!($X_array, $Y_array):
26
- 146 milliseconds, 558 microseconds
42
+ 143 milliseconds, 774 microseconds
27
43
at_dot_call!($X_vector, $Y_vector):
28
- 65 milliseconds, 531 microseconds
29
- custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
30
- 66 milliseconds, 735 microseconds
31
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
32
- 145 milliseconds, 957 microseconds
33
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
34
- 66 milliseconds, 320 microseconds
44
+ 65 milliseconds, 567 microseconds
45
+ custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
46
+ 66 milliseconds, 870 microseconds
47
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
48
+ 143 milliseconds, 643 microseconds
49
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
50
+ 65 milliseconds, 778 microseconds
51
+ custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
52
+ 65 milliseconds, 765 microseconds
53
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
54
+ 144 milliseconds, 271 microseconds
55
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
56
+ 66 milliseconds, 376 microseconds
35
57
```
36
58
37
59
Clima A100 (GPU):
38
60
```
61
+ at_dot_call!($X_array, $Y_array):
62
+ 6 milliseconds, 775 microseconds
39
63
at_dot_call!($X_vector, $Y_vector):
40
- 2 milliseconds, 848 microseconds
41
- custom_kernel_bc!($X_vector, $Y_vector, $(Val(length(X_vector.x1))); printtb = false):
42
- 2 milliseconds, 537 microseconds
43
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = false):
44
- 8 milliseconds, 804 microseconds
45
- custom_kernel_bc!($X_array, $Y_array, $(Val(length(X_vector.x1))); printtb = false, use_pw = true):
46
- 2 milliseconds, 545 microseconds
64
+ 2 milliseconds, 834 microseconds
65
+ custom_sol_kernel!($X_vector, $Y_vector, $(Val(N))):
66
+ 2 milliseconds, 547 microseconds
67
+ custom_kernel_bc!($X_vector, $Y_vector, $us; printtb = false):
68
+ 2 milliseconds, 561 microseconds
69
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = false):
70
+ 4 milliseconds, 160 microseconds
71
+ custom_kernel_bc!($X_array, $Y_array, $us; printtb = false, use_pw = true):
72
+ 2 milliseconds, 584 microseconds
73
+ custom_kernel_bc!($X_vector, $Y_vector, $uss; printtb = false):
74
+ 2 milliseconds, 540 microseconds
75
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = false):
76
+ 2 milliseconds, 715 microseconds
77
+ custom_kernel_bc!($X_array, $Y_array, $uss; printtb = false, use_pw = true):
78
+ 2 milliseconds, 547 microseconds
47
79
```
48
80
=#
49
81
@@ -239,7 +271,7 @@ function at_dot_call!(X, Y)
239
271
return nothing
240
272
end ;
241
273
242
- function custom_kernel ! (X, Y, :: Val{N} ) where {N}
274
+ function custom_sol_kernel ! (X, Y, :: Val{N} ) where {N}
243
275
(; x1, x2, x3) = X
244
276
(; y1) = Y
245
277
kernel = CUDA. @cuda always_inline = true launch = false custom_kernel_knl! (
@@ -267,7 +299,27 @@ function custom_kernel_knl!(y1, x1, x2, x3, ::Val{N}) where {N}
267
299
return nothing
268
300
end ;
269
301
270
- function custom_kernel_bc! (X, Y, :: Val{N} ; printtb= true , use_pw= true ) where {N}
302
"""
    AbstractUniversalSizes{Nv, Nij}

Supertype of the size descriptors used by this benchmark. `Nv` and `Nij` are
always carried as type parameters; each subtype decides how `Nh` is stored.
"""
abstract type AbstractUniversalSizes{Nv, Nij} end

"""
    UniversalSizesCC{Nv, Nij}

Size descriptor with `Nv` and `Nij` in the type domain and `Nh` kept as a
runtime `Int` field.
"""
struct UniversalSizesCC{Nv, Nij} <: AbstractUniversalSizes{Nv, Nij}
    Nh::Int
end

"""
    UniversalSizesStatic{Nv, Nij, Nh}

Fieldless size descriptor: `Nv`, `Nij` and `Nh` are all type parameters.
"""
struct UniversalSizesStatic{Nv, Nij, Nh} <: AbstractUniversalSizes{Nv, Nij} end

# Positional convenience constructors: both take the same `(Nv, Nij, Nh)`
# values and differ only in whether `Nh` ends up as a field or a type parameter.
function UniversalSizesCC(Nv, Nij, Nh)
    return UniversalSizesCC{Nv, Nij}(Nh)
end

function UniversalSizesStatic(Nv, Nij, Nh)
    return UniversalSizesStatic{Nv, Nij, Nh}()
end

"""
    get_Nv(us)

Return the `Nv` type parameter of `us`.
"""
function get_Nv(::AbstractUniversalSizes{Nv}) where {Nv}
    return Nv
end

"""
    get_Nij(us)

Return the `Nij` type parameter of `us`.
"""
function get_Nij(::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij}
    return Nij
end

"""
    get_Nh(us)

Return `Nh`: read from the field for `UniversalSizesCC`, and from the type
parameter for `UniversalSizesStatic`.
"""
function get_Nh(us::UniversalSizesCC)
    return us.Nh
end

function get_Nh(::UniversalSizesStatic{Nv, Nij, Nh}) where {Nv, Nij, Nh}
    return Nh
end

"""
    get_N(us)

Return the product of the extents `(Nv, Nij, Nij, 1, Nh)`.
"""
function get_N(us::AbstractUniversalSizes{Nv, Nij}) where {Nv, Nij}
    # The fourth extent is hard-coded to 1.
    # NOTE(review): presumably the field dimension of the benchmark arrays — confirm.
    extents = (Nv, Nij, Nij, 1, get_Nh(us))
    return prod(extents)
end
315
using Test

# Consistency check: built from the same (Nv, Nij, Nh) triple, the runtime-`Nh`
# descriptor and the fully static descriptor must agree through every accessor.
us_tup = (1, 2, 3)
for accessor in (get_Nv, get_Nij, get_Nh, get_N)
    @test accessor(UniversalSizesCC(us_tup...)) ==
          accessor(UniversalSizesStatic(us_tup...))
end
321
+
322
+ function custom_kernel_bc! (X, Y, us:: AbstractUniversalSizes ; printtb= true , use_pw= true )
271
323
(; x1, x2, x3) = X
272
324
(; y1) = Y
273
325
bc_base = @lazy @. y1 = myadd (x1, x2, x3)
@@ -281,7 +333,7 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
281
333
end
282
334
else
283
335
for i in 1 : 100 # reduce variance / impact of launch latency
284
- @inbounds @simd for j in 1 : N
336
+ @inbounds @simd for j in 1 : get_N (us)
285
337
y1[j] = bc[j]
286
338
end
287
339
end
@@ -291,28 +343,28 @@ function custom_kernel_bc!(X, Y, ::Val{N}; printtb=true, use_pw=true) where {N}
291
343
CUDA. @cuda always_inline = true launch = false custom_kernel_knl_bc! (
292
344
y1,
293
345
bc,
294
- Val (N) ,
346
+ us ,
295
347
)
296
348
config = CUDA. launch_configuration (kernel. fun)
297
349
threads = min (N, config. threads)
298
350
blocks = cld (N, threads)
299
351
printtb && @show blocks, threads
300
352
for i in 1 : 100 # reduce variance / impact of launch latency
301
- kernel (y1, bc, Val (N) ; threads, blocks)
353
+ kernel (y1, bc,us ; threads, blocks)
302
354
end
303
355
end
304
356
return nothing
305
357
end ;
306
358
"""
    get_cart_lin_index(bc, n, I)

Select the index used to read `bc` (and write the destination) for the
linear position `I`.

- For a `Base.Broadcast.Broadcasted` object, `I` is converted into the
  `CartesianIndex` at linear position `I` of a grid with extents `n`.
- For any other `bc`, `I` is returned unchanged and `n` is ignored.

# Arguments
- `bc`: the broadcast-like object; only its type is used, for dispatch.
- `n`: tuple of extents, one entry per dimension.
- `I`: linear index.
"""
@inline get_cart_lin_index(bc, n, I) = I
@inline get_cart_lin_index(bc::Base.Broadcast.Broadcasted, n, I) =
    # Pass the `OneTo` constructor to `map` directly; no wrapper lambda is needed.
    CartesianIndices(map(Base.OneTo, n))[I]
309
- function custom_kernel_knl_bc! (y1, bc, :: Val{N} ) where {N}
361
+ function custom_kernel_knl_bc! (y1, bc, us)
310
362
@inbounds begin
311
363
I = (CUDA. blockIdx (). x - Int32 (1 )) * CUDA. blockDim (). x + CUDA. threadIdx (). x
312
- n = size (y1 )
313
- if 1 ≤ I ≤ N
314
- ind = get_cart_lin_index (bc, n, I)
315
- y1[ind ] = bc[ind ]
364
+ if 1 ≤ I ≤ get_N (us )
365
+ n = ( get_Nv (us), get_Nij (us), get_Nij (us), 1 , get_Nh (us))
366
+ ci = get_cart_lin_index (bc, n, I)
367
+ y1[ci ] = bc[ci ]
316
368
end
317
369
end
318
370
return nothing
@@ -327,16 +379,31 @@ X_vector = to_vec(X_array);
327
379
Y_vector = to_vec (Y_array);
328
380
at_dot_call! (X_array, Y_array)
329
381
at_dot_call! (X_vector, Y_vector)
330
- # custom_kernel!(X_vector, Y_vector, Val(length(X_vector.x1)))
331
- custom_kernel_bc! (X_vector, Y_vector, Val (length (X_vector. x1)))
332
- custom_kernel_bc! (X_array, Y_array, Val (length (X_vector. x1)); use_pw= false )
333
- custom_kernel_bc! (X_array, Y_array, Val (length (X_vector. x1)); use_pw= true )
382
+ N = length (X_vector. x1)
383
+ (Nv, Nij, _, Nf, Nh) = size (Y_array. y1);
384
+ us = UniversalSizesCC (Nv, Nij, Nh);
385
+ uss = UniversalSizesStatic (Nv, Nij, Nh);
386
+ @test get_N (us) == N
387
+ @test get_N (uss) == N
388
+ iscpu = ArrayType === identity
389
+ iscpu || custom_sol_kernel! (X_vector, Y_vector, Val (N))
390
+ custom_kernel_bc! (X_vector, Y_vector, us)
391
+ custom_kernel_bc! (X_array, Y_array, us; use_pw= false )
392
+ custom_kernel_bc! (X_array, Y_array, us; use_pw= true )
393
+
394
+ custom_kernel_bc! (X_vector, Y_vector, uss)
395
+ custom_kernel_bc! (X_array, Y_array, uss; use_pw= false )
396
+ custom_kernel_bc! (X_array, Y_array, uss; use_pw= true )
334
397
335
398
@pretty_belapsed at_dot_call! ($ X_array, $ Y_array) # slow
336
399
@pretty_belapsed at_dot_call! ($ X_vector, $ Y_vector) # fast
337
- # @pretty_belapsed custom_kernel!($X_vector, $Y_vector, $(Val(length(X_vector.x1))))
338
- @pretty_belapsed custom_kernel_bc! ($ X_vector, $ Y_vector, $ (Val (length (X_vector. x1)));printtb= false )
339
- @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ (Val (length (X_vector. x1)));printtb= false , use_pw= false )
340
- @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ (Val (length (X_vector. x1)));printtb= false , use_pw= true )
400
+ iscpu || @pretty_belapsed custom_sol_kernel! ($ X_vector, $ Y_vector, $ (Val (N)))
401
+ @pretty_belapsed custom_kernel_bc! ($ X_vector, $ Y_vector, $ us; printtb= false )
402
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ us; printtb= false , use_pw= false )
403
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ us; printtb= false , use_pw= true )
404
+
405
+ @pretty_belapsed custom_kernel_bc! ($ X_vector, $ Y_vector, $ uss; printtb= false )
406
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ uss; printtb= false , use_pw= false )
407
+ @pretty_belapsed custom_kernel_bc! ($ X_array, $ Y_array, $ uss; printtb= false , use_pw= true )
341
408
342
409
# ! format: on
0 commit comments