diff --git a/Project.toml b/Project.toml
index 2234b0b..624ae70 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ HSARuntime = "2c364e2c-59fb-59c3-96f3-194112e690e0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
 [compat]
diff --git a/src/AMDGPUnative.jl b/src/AMDGPUnative.jl
index 4c518b0..a12c8b8 100644
--- a/src/AMDGPUnative.jl
+++ b/src/AMDGPUnative.jl
@@ -7,6 +7,16 @@ using Adapt
 using TimerOutputs
 using DataStructures
 using Libdl
+using Requires
+
+@enum DeviceRuntime HSA OCL
+const RUNTIME = Ref{DeviceRuntime}(HSA)
+#=
+if get(ENV, "AMDGPUNATIVE_OPENCL", "") != ""
+    RUNTIME[] = OCL
+end
+=#
+include("runtime.jl")
 
 const configured = HSARuntime.configured
 
@@ -29,6 +39,7 @@ include("reflection.jl")
 
 function __init__()
     check_deps()
+    @require OpenCL="08131aa3-fb12-5dee-8b74-c09406e224a2" include("opencl.jl")
     __init_compiler__()
 end
 
diff --git a/src/compiler/common.jl b/src/compiler/common.jl
index 00b5630..86c5067 100644
--- a/src/compiler/common.jl
+++ b/src/compiler/common.jl
@@ -4,7 +4,7 @@ struct CompilerJob
     # core invocation
     f::Base.Callable
     tt::DataType
-    agent::HSAAgent
+    device::RuntimeDevice
     kernel::Bool
 
     # optional properties
@@ -14,10 +14,10 @@ struct CompilerJob
     maxregs::Union{Nothing,Integer}
     name::Union{Nothing,String}
 
-    CompilerJob(f, tt, agent, kernel; name=nothing,
+    CompilerJob(f, tt, device, kernel; name=nothing,
                 minthreads=nothing, maxthreads=nothing,
                 blocks_per_sm=nothing, maxregs=nothing) =
-        new(f, tt, agent, kernel, minthreads, maxthreads, blocks_per_sm,
+        new(f, tt, device, kernel, minthreads, maxthreads, blocks_per_sm,
             maxregs, name)
 end
 
diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl
index 52d7ab4..ceeb10d 100644
--- a/src/compiler/driver.jl
+++ b/src/compiler/driver.jl
@@ -4,11 +4,11 @@
 const compile_hook = Ref{Union{Nothing,Function}}(nothing)
 
 """
-    compile(target::Symbol, agent::HSAAgent, f, tt, kernel=true;
+    compile(target::Symbol, device::RuntimeDevice, f, tt, kernel=true;
             libraries=true, optimize=true, strip=false, strict=true, ...)
-Compile a function `f` invoked with types `tt` for agent `agent` to one of the
-following formats as specified by the `target` argument: `:julia` for Julia
-IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked
+Compile a function `f` invoked with types `tt` for device `device` to one of
+the following formats as specified by the `target` argument: `:julia` for
+Julia IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked
 objects. If the `kernel` flag is set, specialized code generation and
 optimization for kernel functions is enabled.
 The following keyword arguments are supported:
@@ -18,11 +18,11 @@ The following keyword arguments are supported:
 - `strict`: perform code validation either as early or as late as possible
 Other keyword arguments can be found in the documentation of [`rocfunction`](@ref).
 """
-compile(target::Symbol, agent::HSAAgent, @nospecialize(f::Core.Function),
+compile(target::Symbol, device::RuntimeDevice, @nospecialize(f::Core.Function),
                  @nospecialize(tt), kernel::Bool=true; libraries::Bool=true,
                  optimize::Bool=true, strip::Bool=false, strict::Bool=true, kwargs...) =
 
-                 compile(target, CompilerJob(f, tt, agent, kernel; kwargs...);
+                 compile(target, CompilerJob(f, tt, device, kernel; kwargs...);
                          libraries=libraries, optimize=optimize, strip=strip,
                          strict=strict)
 
@@ -88,7 +88,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true,
     # always preload the runtime, and do so early; it cannot be part of any timing block
     # because it recurses into the compiler
     if libraries
-        runtime = load_runtime(job.agent)
+        runtime = load_runtime(job.device)
         runtime_fns = LLVM.name.(defs(runtime))
     end
 
@@ -105,7 +105,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true,
             end
             =#
             # FIXME: Load this only when needed
-            device_libs = load_device_libs(job.agent)
+            device_libs = load_device_libs(job.device)
             for lib in device_libs
                 if need_library(ir, lib)
                     @timeit to[] "device library" link_device_lib!(job, ir, lib)
diff --git a/src/compiler/mcgen.jl b/src/compiler/mcgen.jl
index f7285a4..428c3c5 100644
--- a/src/compiler/mcgen.jl
+++ b/src/compiler/mcgen.jl
@@ -1,12 +1,12 @@
 # machine code generation
 
-function machine(agent::HSAAgent, triple::String)
+function machine(device::RuntimeDevice, triple::String)
     InitializeAMDGPUTarget()
     InitializeAMDGPUTargetInfo()
     t = Target(triple)
 
     InitializeAMDGPUTargetMC()
-    cpu = get_first_isa(agent) # TODO: Make this configurable
+    cpu = default_isa(device) # TODO: Make this configurable
     feat = ""
     tm = TargetMachine(t, triple, cpu, feat)
     asm_verbosity!(tm, true)
@@ -80,7 +80,7 @@ end
 
 function mcgen(job::CompilerJob, mod::LLVM.Module, f::LLVM.Function;
                output_format=LLVM.API.LLVMObjectFile)
-    tm = machine(job.agent, triple(mod))
+    tm = machine(job.device, triple(mod))
 
     InitializeAMDGPUAsmPrinter()
     return String(emit(tm, mod, output_format))
diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl
index a777842..b04f959 100644
--- a/src/compiler/optim.jl
+++ b/src/compiler/optim.jl
@@ -1,7 +1,7 @@
 # LLVM IR optimization
 
 function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)
-    tm = AMDGPUnative.machine(job.agent, triple(mod))
+    tm = AMDGPUnative.machine(job.device, triple(mod))
 
     if job.kernel
         entry = promote_kernel!(job, mod, entry)
diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl
index 0661523..ec0b5d9 100644
--- a/src/compiler/rtlib.jl
+++ b/src/compiler/rtlib.jl
@@ -30,10 +30,10 @@ const libcache = Dict{String, LLVM.Module}()
 
 # ROCm device library
 
-function load_device_libs(agent)
+function load_device_libs(device)
     device_libs_path === nothing && return
 
-    isa_short = replace(get_first_isa(agent), "gfx"=>"")
+    isa_short = replace(default_isa(device), "gfx"=>"")
     device_libs = LLVM.Module[]
     bitcode_files = (
         "hc.amdgcn.bc",
@@ -132,20 +132,20 @@ end
 
 ## functionality to build the runtime library
 
-function emit_function!(mod, agent, f, types, name)
+function emit_function!(mod, device, f, types, name)
     tt = Base.to_tuple_type(types)
-    new_mod, entry = codegen(:llvm, CompilerJob(f, tt, agent, #=kernel=# false);
+    new_mod, entry = codegen(:llvm, CompilerJob(f, tt, device, #=kernel=# false);
                              libraries=false, strict=false)
     LLVM.name!(entry, name)
     link!(mod, new_mod)
 end
 
-function build_runtime(agent)
+function build_runtime(device)
     mod = LLVM.Module("AMDGPUnative run-time library", JuliaContext())
 
     for method in values(Runtime.methods)
         try
-            emit_function!(mod, agent, method.def, method.types, method.llvm_name)
+            emit_function!(mod, device, method.def, method.types, method.llvm_name)
         catch err
             @warn method
         end
@@ -154,8 +154,8 @@ function build_runtime(agent)
     mod
 end
 
-function load_runtime(agent::HSAAgent)
-    isa = get_first_isa(agent)
+function load_runtime(device::RuntimeDevice)
+    isa = default_isa(device)
     name = "amdgpunative.$isa.bc"
     path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name)
     mkpath(dirname(path))
@@ -167,7 +167,7 @@ function load_runtime(agent::HSAAgent)
             end
         else
             @info "Building the AMDGPUnative run-time library for your $isa device, this might take a while..."
-            lib = build_runtime(agent)
+            lib = build_runtime(device)
             open(path, "w") do io
                 write(io, lib)
             end
diff --git a/src/execution.jl b/src/execution.jl
index 6fa9e9c..b1662e4 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -3,7 +3,7 @@
 export @roc, rocconvert, rocfunction
 
 struct Kernel{F,TT}
-    agent::HSAAgent
+    device::RuntimeDevice
     mod::ROCModule
     fun::ROCFunction
 end
@@ -11,8 +11,9 @@ end
 # `split_kwargs()` segregates keyword arguments passed to `@roc` into those
 # affecting the compiler, kernel execution, or both.
 function split_kwargs(kwargs)
-    compiler_kws = [:agent, :queue, :name]
-    call_kws     = [:groupsize, :gridsize, :agent, :queue]
+    # TODO: Alias groupsize and gridsize as threads and blocks, respectively
+    compiler_kws = [:device, :agent, :queue, :name]
+    call_kws     = [:groupsize, :gridsize, :device, :agent, :queue]
     compiler_kwargs = []
     call_kwargs = []
     for kwarg in kwargs
@@ -60,6 +61,23 @@ function assign_args!(code, args)
     return vars, var_exprs
 end
 
+function extract_device(;device=nothing, agent=nothing, kwargs...)
+    if device !== nothing
+        return device
+    elseif agent !== nothing
+        return agent
+    else
+        return default_device()
+    end
+end
+function extract_queue(device; queue=nothing, kwargs...)
+    if queue !== nothing
+        return queue
+    else
+        return default_queue(device)
+    end
+end
+
 # fast lookup of global world age
 world_age() = ccall(:jl_get_tls_world_age, UInt, ())
 
@@ -125,11 +143,11 @@ macro roc(ex...)
             GC.@preserve $(vars...) begin
                 local kernel_args = map(rocconvert, ($(var_exprs...),))
                 local kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
-                local agent = get_default_agent()
-                local kernel = rocfunction(agent, $(esc(f)), kernel_tt;
+                local device = extract_device(; $(map(esc, call_kwargs)...))
+                local kernel = rocfunction(device, $(esc(f)), kernel_tt;
                                            $(map(esc, compiler_kwargs)...))
-                local queue = get_default_queue(agent)
-                local signal = HSASignal()
+                local queue = extract_queue(device; $(map(esc, call_kwargs)...))
+                local signal = create_event()
                 kernel(queue, signal, kernel_args...; $(map(esc, call_kwargs)...))
                 wait(signal)
             end
@@ -188,7 +206,7 @@ The output of this function is automatically cached, i.e. you can simply call
 generated automatically, when the function changes, or when different types or
 keyword arguments are provided.
 """
-@generated function rocfunction(agent::HSAAgent, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...)
+@generated function rocfunction(device::RuntimeDevice, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...)
     tt = Base.to_tuple_type(tt.parameters[1])
     sig = Base.signature_type(f, tt)
     t = Tuple(tt.parameters)
@@ -217,8 +235,8 @@ keyword arguments are provided.
 
         # compile the function
         if !haskey(compilecache, key)
-            fun, mod = compile(:roc, agent, f, tt; name=name, kwargs...)
-            kernel = Kernel{f,tt}(agent, mod, fun)
+            fun, mod = compile(:roc, device, f, tt; name=name, kwargs...)
+            kernel = Kernel{f,tt}(device, mod, fun)
             compilecache[key] = kernel
         end
 
@@ -227,10 +245,10 @@ keyword arguments are provided.
 end
 
 rocfunction(f::Core.Function, tt::Type=Tuple{}; kwargs...) =
-    rocfunction(get_default_agent(), f, tt; kwargs...)
+    rocfunction(default_device(), f, tt; kwargs...)
 
-@generated function call(kernel::Kernel{F,TT}, queue::HSAQueue,
-                         signal::HSASignal, args...; call_kwargs...) where {F,TT}
+@generated function call(kernel::Kernel{F,TT}, queue::RuntimeQueue,
+                         signal::RuntimeEvent, args...; call_kwargs...) where {F,TT}
 
     sig = Base.signature_type(F, TT)
     args = (:F, (:( args[$i] ) for i in 1:length(args))...)
diff --git a/src/execution_utils.jl b/src/execution_utils.jl
index 1207971..da09381 100644
--- a/src/execution_utils.jl
+++ b/src/execution_utils.jl
@@ -51,7 +51,7 @@ function Base.getindex(dims::ROCDim3, idx::Int)
 end
 
 """
-    launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction,
+    launch(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction,
            groupsize::ROCDim, gridsize::ROCDim, args...)
 
 Low-level call to launch a ROC function `f` on the GPU, using `groupsize` and
@@ -63,7 +63,7 @@ copied to the internal kernel parameter buffer, or a pointer to device memory.
 
 This is a low-level call, preferably use [`roccall`](@ref) instead.
 """
-@inline function launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction,
+@inline function launch(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction,
                         groupsize::ROCDim, gridsize::ROCDim, args...)
     groupsize = ROCDim3(groupsize)
     gridsize = ROCDim3(gridsize)
@@ -77,7 +77,7 @@ end
 
 # we need a generated function to get an args array,
 # without having to inspect the types at runtime
-@generated function _launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction,
+@generated function _launch(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction,
                             groupsize::ROCDim3, gridsize::ROCDim3,
                             args::NTuple{N,Any}) where N
 
@@ -101,23 +101,13 @@ end
         GC.@preserve $(arg_refs...) begin
             kernelParams = [$(arg_ptrs...)]
 
-            # link with ld.lld
-            ld_path = HSARuntime.ld_lld_path
-            @assert ld_path != "" "ld.lld was not found; cannot link kernel"
-            # TODO: Do this more idiomatically
-            io = open("/tmp/amdgpu-dump.o", "w")
-            write(io, f.mod.data)
-            close(io)
-            run(`$ld_path -shared -o /tmp/amdgpu.exe /tmp/amdgpu-dump.o`)
-            io = open("/tmp/amdgpu.exe", "r")
-            data = read(io)
-            close(io)
-
-            # generate executable and kernel instance
-            exe = HSAExecutable(queue.agent, data, f.entry)
-            kern = HSAKernelInstance(queue.agent, exe, f.entry, args)
-            HSARuntime.launch!(queue, kern, signal;
-                workgroup_size=groupsize, grid_size=gridsize)
+            # create executable and kernel instance
+            exe = create_executable(get_device(queue), f)
+            kern = create_kernel(get_device(queue), exe, f.entry, args)
+
+            # launch kernel
+            launch_kernel(queue, kern, signal;
+                          groupsize=groupsize, gridsize=gridsize)
         end
     end).args)
 
@@ -125,7 +115,7 @@ end
 end
 
 """
-    roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types, values...;
+    roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, types, values...;
              groupsize::ROCDim, gridsize::ROCDim)
 
 `ccall`-like interface for launching a ROC function `f` on a GPU.
@@ -151,14 +141,14 @@ being slightly faster.
 """
 roccall
 
-@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types::NTuple{N,DataType}, values::Vararg{Any,N};
+@inline function roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, types::NTuple{N,DataType}, values::Vararg{Any,N};
                           kwargs...) where N
     # this cannot be inferred properly (because types only contains `DataType`s),
     # which results in the call `@generated _roccall` getting expanded upon first use
     _roccall(queue, signal, f, Tuple{types...}, values; kwargs...)
 end
 
-@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, values::Vararg{Any,N};
+@inline function roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, tt::Type, values::Vararg{Any,N};
                           kwargs...) where N
     # in this case, the type of `tt` is `Tuple{<:DataType,...}`,
     # which means the generated function can be expanded earlier
@@ -167,7 +157,7 @@ end
 
 # we need a generated function to get a tuple of converted arguments (using unsafe_convert),
 # without having to inspect the types at runtime
-@generated function _roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, args::NTuple{N,Any};
+@generated function _roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, tt::Type, args::NTuple{N,Any};
                               groupsize::ROCDim=1, gridsize::ROCDim=1) where N
 
     # the type of `tt` is Type{Tuple{<:DataType...}}
diff --git a/src/opencl.jl b/src/opencl.jl
new file mode 100644
index 0000000..a3bfff2
--- /dev/null
+++ b/src/opencl.jl
@@ -0,0 +1,3 @@
+# OpenCL runtime interface to AMDGPUnative
+
+include("opencl/args.jl")
diff --git a/src/opencl/args.jl b/src/opencl/args.jl
new file mode 100644
index 0000000..5b58185
--- /dev/null
+++ b/src/opencl/args.jl
@@ -0,0 +1,12 @@
+# OpenCL argument utilities
+
+# Argument accessors
+
+# __agocl_global_offset_x - OpenCL Global Offset X
+# __agocl_global_offset_y - OpenCL Global Offset Y
+# __agocl_global_offset_z - OpenCL Global Offset Z
+# __agocl_printf_addr - OpenCL address of printf buffer
+# __agocl_queue_addr - OpenCL address of virtual queue used by enqueue_kernel
+# __agocl_aqlwrap_addr - OpenCL address of AqlWrap struct used by enqueue_kernel
+# __agocl_multigrid - Pointer argument used for Multi-grid synchronization
+
diff --git a/src/reflection.jl b/src/reflection.jl
index 5baeef5..8b8daa3 100644
--- a/src/reflection.jl
+++ b/src/reflection.jl
@@ -7,7 +7,7 @@ using InteractiveUtils
 #   and/or to support generating otherwise invalid code (e.g. with missing symbols).
 
 """
-    code_llvm([io], f, types; optimize=true, agent::HSAAgent=get_default_agent(), kernel=false,
+    code_llvm([io], f, types; optimize=true, device::RuntimeDevice=default_device(), kernel=false,
               optimize=true, raw=false, dump_module=false, strict=false)
 
 Prints the device LLVM IR generated for the method matching the given generic function and
@@ -15,7 +15,7 @@ type signature to `io` which defaults to `stdout`.
 
 The following keyword arguments are supported:
 
-- `agent`: which device to generate code for
+- `device`: which device to generate code for
 - `kernel`: treat the function as an entry-point kernel
 - `optimize`: determines if the code is optimized, which includes kernel-specific
   optimizations if `kernel` is true
@@ -26,11 +26,11 @@ The following keyword arguments are supported:
 See also: [`@device_code_llvm`](@ref), [`InteractiveUtils.code_llvm`](@ref)
 """
 function code_llvm(io::IO, @nospecialize(func), @nospecialize(types);
-                   optimize::Bool=true, agent::HSAAgent=get_default_agent(),
+                   optimize::Bool=true, device::RuntimeDevice=default_device(),
                    dump_module::Bool=false, raw::Bool=false,
                    kernel::Bool=false, strict::Bool=false, kwargs...)
     tt = Base.to_tuple_type(types)
-    job = CompilerJob(func, tt, agent, kernel; kwargs...)
+    job = CompilerJob(func, tt, device, kernel; kwargs...)
     code_llvm(io, job; optimize=optimize,
               raw=raw, dump_module=dump_module, strict=strict)
 end
@@ -49,14 +49,14 @@ code_llvm(@nospecialize(func), @nospecialize(types); kwargs...) =
     code_llvm(stdout, func, types; kwargs...)
 
 """
-    code_gcn([io], f, types; agent::HSAAgent=get_default_agent(), kernel=false, raw=false, strict=false)
+    code_gcn([io], f, types; device::RuntimeDevice=default_device(), kernel=false, raw=false, strict=false)
 
 Prints the GCN assembly generated for the method matching the given generic function and
 type signature to `io` which defaults to `stdout`.
 
 The following keyword arguments are supported:
 
-- `agent`: which device to generate code for
+- `device`: which device to generate code for
 - `kernel`: treat the function as an entry-point kernel
 - `raw`: return the raw code including all metadata
 - `strict`: verify generate code as early as possible
@@ -64,10 +64,10 @@ The following keyword arguments are supported:
 See also: [`@device_code_gcn`](@ref)
 """
 function code_gcn(io::IO, @nospecialize(func), @nospecialize(types);
-                  agent::HSAAgent=get_default_agent(), kernel::Bool=false,
+                  device::RuntimeDevice=default_device(), kernel::Bool=false,
                   raw::Bool=false, strict::Bool=false, kwargs...)
     tt = Base.to_tuple_type(types)
-    job = CompilerJob(func, tt, agent, kernel; kwargs...)
+    job = CompilerJob(func, tt, device, kernel; kwargs...)
     code_gcn(io, job; raw=raw, strict=strict)
 end
 
diff --git a/src/runtime.jl b/src/runtime.jl
new file mode 100644
index 0000000..616f5c3
--- /dev/null
+++ b/src/runtime.jl
@@ -0,0 +1,58 @@
+struct RuntimeDevice{D}
+    device::D
+end
+default_device() = RuntimeDevice(default_device(RUNTIME[]))
+default_device(::typeof(HSA)) = HSARuntime.get_default_agent()
+
+struct RuntimeQueue{Q}
+    queue::Q
+end
+default_queue(device) = RuntimeQueue(default_queue(RUNTIME[], device))
+default_queue(::typeof(HSA), device) =
+    HSARuntime.get_default_queue(device.device)
+get_device(queue::RuntimeQueue{HSAQueue}) = RuntimeDevice(queue.queue.agent)
+
+default_isa(device::RuntimeDevice{HSAAgent}) =
+    HSARuntime.get_first_isa(device.device)
+
+struct RuntimeEvent{E}
+    event::E
+end
+create_event() = RuntimeEvent(create_event(RUNTIME[]))
+create_event(::typeof(HSA)) = HSASignal()
+Base.wait(event::RuntimeEvent) = wait(event.event)
+
+struct RuntimeExecutable{E}
+    exe::E
+end
+create_executable(device, func) =
+    RuntimeExecutable(create_executable(RUNTIME[], device, func))
+function create_executable(::typeof(HSA), device, func)
+    # link with ld.lld
+    ld_path = HSARuntime.ld_lld_path
+    @assert ld_path != "" "ld.lld was not found; cannot link kernel"
+    # TODO: Do this more idiomatically
+    io = open("/tmp/amdgpu-dump.o", "w")
+    write(io, func.mod.data)
+    close(io)
+    run(`$ld_path -shared -o /tmp/amdgpu.exe /tmp/amdgpu-dump.o`)
+    io = open("/tmp/amdgpu.exe", "r")
+    data = read(io)
+    close(io)
+
+    return HSAExecutable(device.device, data, func.entry)
+end
+
+struct RuntimeKernel{K}
+    kernel::K
+end
+create_kernel(device, exe, entry, args) =
+    RuntimeKernel(create_kernel(RUNTIME[], device, exe, entry, args))
+create_kernel(::typeof(HSA), device, exe, entry, args) =
+    HSAKernelInstance(device.device, exe.exe, entry, args)
+launch_kernel(queue, kern, event; kwargs...) =
+    launch_kernel(RUNTIME[], queue, kern, event; kwargs...)
+launch_kernel(::typeof(HSA), queue, kern, event;
+              groupsize=nothing, gridsize=nothing) =
+    HSARuntime.launch!(queue.queue, kern.kernel, event.event;
+                       workgroup_size=groupsize, grid_size=gridsize)