Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow opt-out of implicit bounds-checking #563

Merged
merged 1 commit into from
Feb 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 81 additions & 0 deletions docs/src/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,87 @@ Major refactor of KernelAbstractions. In particular:
- Removal of the event system. Kernel are now implicitly ordered.
- Removal of backend packages, backends are now directly provided by CUDA.jl and similar

#### 0.9.33
Restricts the semantics of `@synchronize` to require convergent execution.
The OpenCL backend had several miscompilations due to divergent execution of `@synchronize`.
The `CPU` backend always had this limitation and upon investigation the CUDA backend similarly requires convergent execution,
but allows for a wider set of valid kernels.

This highlighted a design flaw in KernelAbstractions. Most GPU implementations execute KernelAbstractions workgroups on static blocks.
This means a kernel with `ndrange=(32, 30)` might be executed on a static block of `(32, 32)`. In order to block these extra indices,
KernelAbstractions would insert a dynamic boundscheck.

Prior to v0.9.33 a kernel like

```julia
@kernel function localmem(A)
N = @uniform prod(@groupsize())
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
A[I] = lmem[N - i + 1]
end
```

was lowered to GPU backends like this:

```julia
function localmem_gpu(A)
if __validindex(__ctx__)
N = @uniform prod(@groupsize())
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
A[I] = lmem[N - i + 1]
end
end
```

This would cause an implicit divergent execution of `@synchronize`.

With this release the lowering has been changed to:

```julia
function localmem_gpu(A)
    __valid_lane__ = __validindex(__ctx__)
N = @uniform prod(@groupsize())
lmem = @localmem Int (N,) # Ok iff groupsize is static
if __valid_lane__
I = @index(Global, Linear)
i = @index(Local, Linear)
lmem[i] = i
end
@synchronize
if __valid_lane__
A[I] = lmem[N - i + 1]
end
end
```

Note that this follows the CPU lowering with respect to `@uniform`, `@private`, `@localmem` and `@synchronize`.

Since this transformation can be disruptive, users can now opt out of the implicit bounds check,
but users must avoid the use of `@index(Global)` and instead use their own derivation based on `@index(Group)` and `@index(Local)`.

```julia
@kernel unsafe_indicies=false function localmem(A)
N = @uniform prod(@groupsize())
gI = @index(Group, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
I = (gI - 1) * N + i
if i <= N && I <= length(A)
A[I] = lmem[N - i + 1]
end
end
```

## Semantic differences

### To CUDA.jl/AMDGPU.jl
Expand Down
12 changes: 9 additions & 3 deletions src/KernelAbstractions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ synchronize(backend)
```
"""
macro kernel(expr)
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false)
return __kernel(expr, #=generate_cpu=# true, #=force_inbounds=# false, #=unsafe_indicies=# false)
end

"""
Expand All @@ -60,6 +60,7 @@ This allows for two different configurations:

1. `cpu={true, false}`: Disables code-generation of the CPU function. This relaxes semantics such that KernelAbstractions primitives can be used in non-kernel functions.
2. `inbounds={false, true}`: Enables a forced `@inbounds` macro around the function definition in the case the user is using too many `@inbounds` already in their kernel. Note that this can lead to incorrect results, crashes, etc and is fundamentally unsafe. Be careful!
3. `unsafe_indicies={false, true}`: Disables the implicit validation of indices; users must avoid `@index(Global)`.

- [`@context`](@ref)

Expand All @@ -68,9 +69,10 @@ This allows for two different configurations:
"""
macro kernel(ex...)
if length(ex) == 1
return __kernel(ex[1], true, false)
return __kernel(ex[1], true, false, false)
else
generate_cpu = true
unsafe_indicies = false
force_inbounds = false
for i in 1:(length(ex) - 1)
if ex[i] isa Expr && ex[i].head == :(=) &&
Expand All @@ -79,16 +81,20 @@ macro kernel(ex...)
elseif ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :inbounds && ex[i].args[2] isa Bool
force_inbounds = ex[i].args[2]
elseif ex[i] isa Expr && ex[i].head == :(=) &&
ex[i].args[1] == :unsafe_indicies && ex[i].args[2] isa Bool
unsafe_indicies = ex[i].args[2]
else
error(
"Configuration should be of form:\n" *
"* `cpu=true`\n" *
"* `inbounds=false`\n" *
"* `unsafe_indicies=false`\n" *
"got `", ex[i], "`",
)
end
end
return __kernel(ex[end], generate_cpu, force_inbounds)
return __kernel(ex[end], generate_cpu, force_inbounds, unsafe_indicies)
end
end

Expand Down
12 changes: 8 additions & 4 deletions src/macros.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ function find_return(stmt)
end

# XXX: Proper errors
function __kernel(expr, generate_cpu = true, force_inbounds = false)
function __kernel(expr, generate_cpu = true, force_inbounds = false, unsafe_indicies = true)
def = splitdef(expr)
name = def[:name]
args = def[:args]
Expand Down Expand Up @@ -46,7 +46,7 @@ function __kernel(expr, generate_cpu = true, force_inbounds = false)

def_gpu = deepcopy(def)
def_gpu[:name] = gpu_name = Symbol(:gpu_, name)
transform_gpu!(def_gpu, constargs, force_inbounds)
transform_gpu!(def_gpu, constargs, force_inbounds, unsafe_indicies)
gpu_function = combinedef(def_gpu)

# create constructor functions
Expand Down Expand Up @@ -78,7 +78,7 @@ end

# The easy case, transform the function for GPU execution
# - mark constant arguments by applying `constify`.
function transform_gpu!(def, constargs, force_inbounds)
function transform_gpu!(def, constargs, force_inbounds, unsafe_indicies)
let_constargs = Expr[]
for (i, arg) in enumerate(def[:args])
if constargs[i]
Expand All @@ -94,7 +94,11 @@ function transform_gpu!(def, constargs, force_inbounds)
if force_inbounds
push!(new_stmts, Expr(:inbounds, true))
end
append!(new_stmts, split(emit_gpu, body.args))
if !unsafe_indicies
append!(new_stmts, split(emit_gpu, body.args))
else
push!(new_stmts, body)
end
if force_inbounds
push!(new_stmts, Expr(:inbounds, :pop))
end
Expand Down
15 changes: 14 additions & 1 deletion test/localmem.jl
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,22 @@ end
end
end

@kernel unsafe_indicies = false function localmem_unsafe_indicies(A)
N = @uniform prod(@groupsize())
gI = @index(Group, Linear)
i = @index(Local, Linear)
lmem = @localmem Int (N,) # Ok iff groupsize is static
lmem[i] = i
@synchronize
I = (gI - 1) * N + i
if I <= length(A)
A[I] = lmem[N - i + 1]
end
end

function localmem_testsuite(backend, ArrayT)
@testset "kernels" begin
@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16))
@testset for kernel! in (localmem(backend(), 16), localmem2(backend(), 16), localmem_unsafe_indicies(backend(), 16))
A = ArrayT{Int}(undef, 64)
kernel!(A, ndrange = size(A))
synchronize(backend())
Expand Down
Loading