Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow opt-out of implicit bounds-checking #563

Merged
merged 1 commit into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,87 @@ Major refactor of KernelAbstractions. In particular:
- Removal of the event system. Kernel are now implicitly ordered.
- Removal of backend packages, backends are now directly provided by CUDA.jl and similar

#### 0.9.33
Restricts the semantics of `@synchronize` to require convergent execution.
The OpenCL backend had several miscompilations due to divergent execution of `@synchronize`.
The `CPU` backend always had this limitation and upon investigation the CUDA backend similarly requires convergent execution,
but allows for a wider set of valid kernels.

This highlighted a design flaw in KernelAbstractions. Most GPU implementations execute KernelAbstractions workgroups on static blocks.
This means a kernel with `ndrange=(32, 30)` might be executed on a static block of `(32, 32)`. In order to block these extra indices,
KernelAbstractions would insert a dynamic boundscheck.

Prior to v0.9.33 a kernel like

```julia
@kernel function localmem(A)
N = @uniform prod(@groupsize())
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
A[I] = lmem[N - i + 1]
end
```

was lowered to GPU backends like this:

```julia
function localmem_gpu(A)
if __validindex(__ctx__)
N = @uniform prod(@groupsize())
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
A[I] = lmem[N - i + 1]
end
end
```

This would cause an implicit divergent execution of `@synchronize`.

With this release the lowering has been changed to:

```julia
function localmem_gpu(A)
    __valid_lane__ = __validindex(__ctx__)
N = @uniform prod(@groupsize())
lmem = @localmem Int (N,) # Ok iff groupsize is static
if __valid_lane__
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem[i] = i
end
@synchronize
if __valid_lane__
A[I] = lmem[N - i + 1]
end
end
```

Note that this follows the CPU lowering with respect to `@uniform`, `@private`, `@localmem` and `@synchronize`.

Since this transformation can be disruptive, users can now opt out of the implicit bounds check,
but users must avoid the use of `@index(Global)` and instead use their own derivation based on `@index(Group)` and `@index(Local)`.

```julia
@kernel unsafe_indicies=false function localmem(A)
N = @uniform prod(@groupsize())
gI = @index(Group, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
I = (gI - 1) * N + i
if i <= N && I <= length(A)
A[I] = lmem[N - i + 1]
end
end
```

## Semantic differences

### To CUDA.jl/AMDGPU.jl
Expand Down
12 changes: 9 additions & 3 deletions src/KernelAbstractions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ synchronize(backend)
```
"""
macro kernel(expr)
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indicies=# false)
end

"""
Expand All @@ -60,6 +60,7 @@ This allows for two different configurations:

1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions.
2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in the case the user is using too many `@inbounds` already in their kernel. Note that this can lead to incorrect results, crashes, etc and is fundamentally unsafe. Be careful!
3. `unsafe_indicies={false, true}`: Disables the implicit validation of indices; users must avoid `@index(Global)`.

- [`@context`](@ref)

Expand All @@ -68,9 +69,10 @@ This allows for two different configurations:
"""
macro kernel(ex...)
if length(ex) == 1
return __kernel(ex[1], true, false)
return __kernel(ex[1], true, false, false)
else
generate_cpu = true
unsafe_indicies = false
force_inbounds = false
for i in 1:(length(ex) - 1)
if ex[i] isa Expr && ex[i].head == :(=) &&
Expand All @@ -79,16 +81,20 @@ macro kernel(ex...)
elseif ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
force_inbounds = ex[i].args[2]
elseif ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :unsafe_indicies && ex[i].args[2] isa Bool
unsafe_indicies = ex[i].args[2]
else
error(
"Configuration should be of form:\n" *
"* `cpu=true`\n" *
"* `inbounds=false`\n" *
"* `unsafe_indicies=false`\n" *
"got `", ex[i], "`",
)
end
end
return __kernel(ex[end], generate_cpu, force_inbounds)
return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indicies)
end
end

Expand Down
12 changes: 8 additions & 4 deletions src/macros.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function find_return(stmt)
end

# XXX: Proper errors
function __kernel(expr, generate_cpu = true, force_inbounds = false)
function __kernel(expr, generate_cpu = true, force_inbounds = false, unsafe_indicies = true)
def = splitdef(expr)
name = def[:name]
args = def[:args]
Expand Down Expand Up @@ -46,7 +46,7 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false)

def_gpu = deepcopy(def)
def_gpu[:name] = gpu_name = Symbol(:gpu_, name)
transform_gpu!(def_gpu, constargs, force_inbounds)
transform_gpu!(def_gpu, constargs, force_inbounds, unsafe_indicies)
gpu_function = combinedef(def_gpu)

# create constructor functions
Expand Down Expand Up @@ -78,7 +78,7 @@ end

# The easy case, transform the function for GPU execution
# - mark constant arguments by applying `constify`.
function transform_gpu!(def, constargs, force_inbounds)
function transform_gpu!(def, constargs, force_inbounds, unsafe_indicies)
let_constargs = Expr[]
for (i, arg) in enumerate(def[:args])
if constargs[i]
Expand All @@ -94,7 +94,11 @@ function transform_gpu!(def, constargs, force_inbounds)
if force_inbounds
push!(new_stmts, Expr(:inbounds, true))
end
append!(new_stmts, split(emit_gpu, body.args))
if !unsafe_indicies
append!(new_stmts, split(emit_gpu, body.args))
else
push!(new_stmts, body)
end
if force_inbounds
push!(new_stmts, Expr(:inbounds, :pop))
end
Expand Down
15 changes: 14 additions & 1 deletion test/localmem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,22 @@ end
end
end

@kernel unsafe_indicies = false function localmem_unsafe_indicies(A)
N = @uniform prod(@groupsize())
gI = @index(Group, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
I = (gI - 1) * N + i
if I <= length(A)
A[I] = lmem[N - i + 1]
end
end

function localmem_testsuite(backend, ArrayT)
@testset "kernels" begin
@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16))
@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indicies(backend(), 16))
A = ArrayT{Int}(undef, 64)
kernel!(A, ndrange = size(A))
synchronize(backend())
Expand Down
Loading