diff --git a/Project.toml b/Project.toml
index 2234b0b..b5c0bfd 100644
--- a/Project.toml
+++ b/Project.toml
@@ -10,6 +10,7 @@ HSARuntime = "2c364e2c-59fb-59c3-96f3-194112e690e0"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+Requires = "ae029012-a4dd-5104-9daa-d747884805df"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
 [compat]
@@ -22,8 +23,10 @@ TimerOutputs = "0.5"
 julia = "1"
 
 [extras]
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
+OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["SpecialFunctions", "Test"]
+test = ["LinearAlgebra", "OpenCL", "SpecialFunctions", "Test"]
diff --git a/src/AMDGPUnative.jl b/src/AMDGPUnative.jl
index 4c518b0..83047ed 100644
--- a/src/AMDGPUnative.jl
+++ b/src/AMDGPUnative.jl
@@ -7,6 +7,14 @@ using Adapt
 using TimerOutputs
 using DataStructures
 using Libdl
+using Requires
+
+@enum DeviceRuntime HSA OCL
+const RUNTIME = Ref{DeviceRuntime}(HSA)
+if get(ENV, "AMDGPUNATIVE_OPENCL", "") != ""
+    RUNTIME[] = OCL
+end
+include("runtime.jl")
 
 const configured = HSARuntime.configured
 
@@ -21,6 +29,7 @@ include(joinpath("device", "pointer.jl"))
 include(joinpath("device", "array.jl"))
 include(joinpath("device", "gcn.jl"))
 include(joinpath("device", "runtime.jl"))
+include(joinpath("device", "llvm.jl"))
 
 include("execution_utils.jl")
 include("compiler.jl")
@@ -29,6 +38,7 @@ include("reflection.jl")
 
 function __init__()
     check_deps()
+    @require OpenCL="08131aa3-fb12-5dee-8b74-c09406e224a2" include("opencl.jl")
     __init_compiler__()
 end
 
diff --git a/src/compiler/common.jl b/src/compiler/common.jl
index 00b5630..86c5067 100644
--- a/src/compiler/common.jl
+++ b/src/compiler/common.jl
@@ -4,7 +4,7 @@ struct CompilerJob
     # core invocation
     f::Base.Callable
     tt::DataType
-    agent::HSAAgent
+    device::RuntimeDevice
     kernel::Bool
 
     # optional properties
@@ -14,10 +14,10 @@ struct CompilerJob
     maxregs::Union{Nothing,Integer}
     name::Union{Nothing,String}
 
-    CompilerJob(f, tt, agent, kernel; name=nothing,
+    CompilerJob(f, tt, device, kernel; name=nothing,
                 minthreads=nothing, maxthreads=nothing,
                 blocks_per_sm=nothing, maxregs=nothing) =
-        new(f, tt, agent, kernel, minthreads, maxthreads, blocks_per_sm,
+        new(f, tt, device, kernel, minthreads, maxthreads, blocks_per_sm,
             maxregs, name)
 end
 
diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl
index 52d7ab4..ceeb10d 100644
--- a/src/compiler/driver.jl
+++ b/src/compiler/driver.jl
@@ -4,11 +4,11 @@
 const compile_hook = Ref{Union{Nothing,Function}}(nothing)
 
 """
-    compile(target::Symbol, agent::HSAAgent, f, tt, kernel=true;
+    compile(target::Symbol, device::RuntimeDevice, f, tt, kernel=true;
             libraries=true, optimize=true, strip=false, strict=true, ...)
-Compile a function `f` invoked with types `tt` for agent `agent` to one of the
-following formats as specified by the `target` argument: `:julia` for Julia
-IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked
+Compile a function `f` invoked with types `tt` for device `device` to one of
+the following formats as specified by the `target` argument: `:julia` for
+Julia IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked
 objects. If the `kernel` flag is set, specialized code generation and
 optimization for kernel functions is enabled.
 The following keyword arguments are supported:
@@ -18,11 +18,11 @@ The following keyword arguments are supported:
 - `strict`: perform code validation either as early or as late as possible
 Other keyword arguments can be found in the documentation of [`rocfunction`](@ref).
 """
-compile(target::Symbol, agent::HSAAgent, @nospecialize(f::Core.Function),
+compile(target::Symbol, device::RuntimeDevice, @nospecialize(f::Core.Function),
                  @nospecialize(tt), kernel::Bool=true; libraries::Bool=true,
                  optimize::Bool=true, strip::Bool=false, strict::Bool=true, kwargs...) =
 
-                 compile(target, CompilerJob(f, tt, agent, kernel; kwargs...);
+                 compile(target, CompilerJob(f, tt, device, kernel; kwargs...);
                          libraries=libraries, optimize=optimize, strip=strip,
                          strict=strict)
 
@@ -88,7 +88,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true,
     # always preload the runtime, and do so early; it cannot be part of any timing block
     # because it recurses into the compiler
     if libraries
-        runtime = load_runtime(job.agent)
+        runtime = load_runtime(job.device)
         runtime_fns = LLVM.name.(defs(runtime))
     end
 
@@ -105,7 +105,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true,
             end
             =#
             # FIXME: Load this only when needed
-            device_libs = load_device_libs(job.agent)
+            device_libs = load_device_libs(job.device)
             for lib in device_libs
                 if need_library(ir, lib)
                     @timeit to[] "device library" link_device_lib!(job, ir, lib)
diff --git a/src/compiler/mcgen.jl b/src/compiler/mcgen.jl
index f7285a4..428c3c5 100644
--- a/src/compiler/mcgen.jl
+++ b/src/compiler/mcgen.jl
@@ -1,12 +1,12 @@
 # machine code generation
 
-function machine(agent::HSAAgent, triple::String)
+function machine(device::RuntimeDevice, triple::String)
     InitializeAMDGPUTarget()
     InitializeAMDGPUTargetInfo()
     t = Target(triple)
 
     InitializeAMDGPUTargetMC()
-    cpu = get_first_isa(agent) # TODO: Make this configurable
+    cpu = default_isa(device) # TODO: Make this configurable
     feat = ""
     tm = TargetMachine(t, triple, cpu, feat)
     asm_verbosity!(tm, true)
@@ -80,7 +80,7 @@ end
 
 function mcgen(job::CompilerJob, mod::LLVM.Module, f::LLVM.Function;
                output_format=LLVM.API.LLVMObjectFile)
-    tm = machine(job.agent, triple(mod))
+    tm = machine(job.device, triple(mod))
 
     InitializeAMDGPUAsmPrinter()
     return String(emit(tm, mod, output_format))
diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl
index a777842..b04f959 100644
--- a/src/compiler/optim.jl
+++ b/src/compiler/optim.jl
@@ -1,7 +1,7 @@
 # LLVM IR optimization
 
 function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function)
-    tm = AMDGPUnative.machine(job.agent, triple(mod))
+    tm = AMDGPUnative.machine(job.device, triple(mod))
 
     if job.kernel
         entry = promote_kernel!(job, mod, entry)
diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl
index 0661523..ec0b5d9 100644
--- a/src/compiler/rtlib.jl
+++ b/src/compiler/rtlib.jl
@@ -30,10 +30,10 @@ const libcache = Dict{String, LLVM.Module}()
 
 # ROCm device library
 
-function load_device_libs(agent)
+function load_device_libs(device)
     device_libs_path === nothing && return
 
-    isa_short = replace(get_first_isa(agent), "gfx"=>"")
+    isa_short = replace(default_isa(device), "gfx"=>"")
     device_libs = LLVM.Module[]
     bitcode_files = (
         "hc.amdgcn.bc",
@@ -132,20 +132,20 @@ end
 
 ## functionality to build the runtime library
 
-function emit_function!(mod, agent, f, types, name)
+function emit_function!(mod, device, f, types, name)
     tt = Base.to_tuple_type(types)
-    new_mod, entry = codegen(:llvm, CompilerJob(f, tt, agent, #=kernel=# false);
+    new_mod, entry = codegen(:llvm, CompilerJob(f, tt, device, #=kernel=# false);
                              libraries=false, strict=false)
     LLVM.name!(entry, name)
     link!(mod, new_mod)
 end
 
-function build_runtime(agent)
+function build_runtime(device)
     mod = LLVM.Module("AMDGPUnative run-time library", JuliaContext())
 
     for method in values(Runtime.methods)
         try
-            emit_function!(mod, agent, method.def, method.types, method.llvm_name)
+            emit_function!(mod, device, method.def, method.types, method.llvm_name)
         catch err
             @warn method
         end
@@ -154,8 +154,8 @@ function build_runtime(agent)
     mod
 end
 
-function load_runtime(agent::HSAAgent)
-    isa = get_first_isa(agent)
+function load_runtime(device::RuntimeDevice)
+    isa = default_isa(device)
     name = "amdgpunative.$isa.bc"
     path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name)
     mkpath(dirname(path))
@@ -167,7 +167,7 @@ function load_runtime(agent::HSAAgent)
             end
         else
             @info "Building the AMDGPUnative run-time library for your $isa device, this might take a while..."
-            lib = build_runtime(agent)
+            lib = build_runtime(device)
             open(path, "w") do io
                 write(io, lib)
             end
diff --git a/src/execution.jl b/src/execution.jl
index 6fa9e9c..d0e79b7 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -3,7 +3,7 @@
 export @roc, rocconvert, rocfunction
 
 struct Kernel{F,TT}
-    agent::HSAAgent
+    device::RuntimeDevice
     mod::ROCModule
     fun::ROCFunction
 end
@@ -11,8 +11,9 @@ end
 # `split_kwargs()` segregates keyword arguments passed to `@roc` into those
 # affecting the compiler, kernel execution, or both.
 function split_kwargs(kwargs)
-    compiler_kws = [:agent, :queue, :name]
-    call_kws     = [:groupsize, :gridsize, :agent, :queue]
+    # TODO: Alias groupsize and gridsize as threads and blocks, respectively
+    compiler_kws = [:device, :agent, :queue, :name]
+    call_kws     = [:groupsize, :gridsize, :device, :agent, :queue]
     compiler_kwargs = []
     call_kwargs = []
     for kwarg in kwargs
@@ -60,6 +61,23 @@ function assign_args!(code, args)
     return vars, var_exprs
 end
 
+function extract_device(;device=nothing, agent=nothing, kwargs...)
+    if device !== nothing
+        return device
+    elseif agent !== nothing
+        return agent
+    else
+        return default_device()
+    end
+end
+function extract_queue(device; queue=nothing, kwargs...)
+    if queue !== nothing
+        return queue
+    else
+        return default_queue(device)
+    end
+end
+
 # fast lookup of global world age
 world_age() = ccall(:jl_get_tls_world_age, UInt, ())
 
@@ -125,13 +143,12 @@ macro roc(ex...)
             GC.@preserve $(vars...) begin
                 local kernel_args = map(rocconvert, ($(var_exprs...),))
                 local kernel_tt = Tuple{Core.Typeof.(kernel_args)...}
-                local agent = get_default_agent()
-                local kernel = rocfunction(agent, $(esc(f)), kernel_tt;
+                local device = extract_device(; $(map(esc, call_kwargs)...))
+                local kernel = rocfunction(device, $(esc(f)), kernel_tt;
                                            $(map(esc, compiler_kwargs)...))
-                local queue = get_default_queue(agent)
-                local signal = HSASignal()
-                kernel(queue, signal, kernel_args...; $(map(esc, call_kwargs)...))
-                wait(signal)
+                local queue = extract_queue(device; $(map(esc, call_kwargs)...))
+                local event = kernel(queue, kernel_args...; $(map(esc, call_kwargs)...))
+                wait(event)
             end
         end)
     return code
@@ -188,7 +205,7 @@ The output of this function is automatically cached, i.e. you can simply call
 generated automatically, when the function changes, or when different types or
 keyword arguments are provided.
 """
-@generated function rocfunction(agent::HSAAgent, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...)
+@generated function rocfunction(device::RuntimeDevice, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...)
     tt = Base.to_tuple_type(tt.parameters[1])
     sig = Base.signature_type(f, tt)
     t = Tuple(tt.parameters)
@@ -217,8 +234,8 @@ keyword arguments are provided.
 
         # compile the function
         if !haskey(compilecache, key)
-            fun, mod = compile(:roc, agent, f, tt; name=name, kwargs...)
-            kernel = Kernel{f,tt}(agent, mod, fun)
+            fun, mod = compile(:roc, device, f, tt; name=name, kwargs...)
+            kernel = Kernel{f,tt}(device, mod, fun)
             compilecache[key] = kernel
         end
 
@@ -227,10 +244,10 @@ keyword arguments are provided.
 end
 
 rocfunction(f::Core.Function, tt::Type=Tuple{}; kwargs...) =
-    rocfunction(get_default_agent(), f, tt; kwargs...)
+    rocfunction(default_device(), f, tt; kwargs...)
 
-@generated function call(kernel::Kernel{F,TT}, queue::HSAQueue,
-                         signal::HSASignal, args...; call_kwargs...) where {F,TT}
+@generated function call(kernel::Kernel{F,TT}, queue::RuntimeQueue,
+                         args...; call_kwargs...) where {F,TT}
 
     sig = Base.signature_type(F, TT)
     args = (:F, (:( args[$i] ) for i in 1:length(args))...)
@@ -254,7 +271,7 @@ rocfunction(f::Core.Function, tt::Type=Tuple{}; kwargs...) =
 
     quote
         Base.@_inline_meta
-        roccall(queue, signal, kernel.fun, $call_tt, $(call_args...); call_kwargs...)
+        roccall(queue, kernel.fun, $call_tt, $(call_args...); call_kwargs...)
     end
 end
 
diff --git a/src/execution_utils.jl b/src/execution_utils.jl
index 1207971..b469687 100644
--- a/src/execution_utils.jl
+++ b/src/execution_utils.jl
@@ -50,20 +50,32 @@ function Base.getindex(dims::ROCDim3, idx::Int)
            error("Invalid dimension: $idx")
 end
 
+function link_kernel(func::ROCFunction)
+    # link with ld.lld
+    ld_path = HSARuntime.ld_lld_path
+    @assert ld_path != "" "ld.lld was not found; cannot link kernel"
+    mktemp() do objpath, objio
+        write(objio, func.mod.data)
+        flush(objio)
+        run(`$ld_path -shared -o $(objpath*".exe") $objpath`)
+        data = read(objpath*".exe")
+    end
+end
+
 """
-    launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction,
+    launch(queue::RuntimeQueue, f::ROCFunction,
            groupsize::ROCDim, gridsize::ROCDim, args...)
 
 Low-level call to launch a ROC function `f` on the GPU, using `groupsize` and
 `gridsize` as the grid and block configuration, respectively. The kernel is
-launched on `queue` and is waited on by `signal`.
+launched on `queue` and returns a `RuntimeEvent`.
 
 Arguments to a kernel should either be bitstype, in which case they will be
 copied to the internal kernel parameter buffer, or a pointer to device memory.
 
 This is a low-level call, preferably use [`roccall`](@ref) instead.
 """
-@inline function launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction,
+@inline function launch(queue::RuntimeQueue, f::ROCFunction,
                         groupsize::ROCDim, gridsize::ROCDim, args...)
     groupsize = ROCDim3(groupsize)
     gridsize = ROCDim3(gridsize)
@@ -72,12 +84,12 @@ This is a low-level call, preferably use [`roccall`](@ref) instead.
     (gridsize.x>0 && gridsize.y>0 && gridsize.z>0) ||
         throw(ArgumentError("Grid dimensions should be non-null"))
 
-    _launch(queue, signal, f, groupsize, gridsize, args...)
+    _launch(queue, f, groupsize, gridsize, args...)
 end
 
 # we need a generated function to get an args array,
 # without having to inspect the types at runtime
-@generated function _launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction,
+@generated function _launch(queue::RuntimeQueue, f::ROCFunction,
                             groupsize::ROCDim3, gridsize::ROCDim3,
                             args::NTuple{N,Any}) where N
 
@@ -101,23 +113,12 @@ end
         GC.@preserve $(arg_refs...) begin
             kernelParams = [$(arg_ptrs...)]
 
-            # link with ld.lld
-            ld_path = HSARuntime.ld_lld_path
-            @assert ld_path != "" "ld.lld was not found; cannot link kernel"
-            # TODO: Do this more idiomatically
-            io = open("/tmp/amdgpu-dump.o", "w")
-            write(io, f.mod.data)
-            close(io)
-            run(`$ld_path -shared -o /tmp/amdgpu.exe /tmp/amdgpu-dump.o`)
-            io = open("/tmp/amdgpu.exe", "r")
-            data = read(io)
-            close(io)
-
-            # generate executable and kernel instance
-            exe = HSAExecutable(queue.agent, data, f.entry)
-            kern = HSAKernelInstance(queue.agent, exe, f.entry, args)
-            HSARuntime.launch!(queue, kern, signal;
-                workgroup_size=groupsize, grid_size=gridsize)
+            # create executable and kernel instance
+            exe = create_executable(get_device(queue), f)
+            kern = create_kernel(get_device(queue), exe, f.entry, args)
+
+            # launch kernel
+            launch_kernel(queue, kern; groupsize=groupsize, gridsize=gridsize)
         end
     end).args)
 
@@ -125,7 +126,7 @@ end
 end
 
 """
-    roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types, values...;
+    roccall(queue::RuntimeQueue, f::ROCFunction, types, values...;
              groupsize::ROCDim, gridsize::ROCDim)
 
 `ccall`-like interface for launching a ROC function `f` on a GPU.
@@ -151,24 +152,26 @@ being slightly faster.
 """
 roccall
 
-@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types::NTuple{N,DataType}, values::Vararg{Any,N};
-                          kwargs...) where N
+@inline function roccall(queue::RuntimeQueue, f::ROCFunction,
+                         types::NTuple{N,DataType}, values::Vararg{Any,N};
+                         kwargs...) where N
     # this cannot be inferred properly (because types only contains `DataType`s),
     # which results in the call `@generated _roccall` getting expanded upon first use
-    _roccall(queue, signal, f, Tuple{types...}, values; kwargs...)
+    _roccall(queue, f, Tuple{types...}, values; kwargs...)
 end
 
-@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, values::Vararg{Any,N};
-                          kwargs...) where N
+@inline function roccall(queue::RuntimeQueue, f::ROCFunction, tt::Type,
+                         values::Vararg{Any,N}; kwargs...) where N
     # in this case, the type of `tt` is `Tuple{<:DataType,...}`,
     # which means the generated function can be expanded earlier
-    _roccall(queue, signal, f, tt, values; kwargs...)
+    _roccall(queue, f, tt, values; kwargs...)
 end
 
 # we need a generated function to get a tuple of converted arguments (using unsafe_convert),
 # without having to inspect the types at runtime
-@generated function _roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, args::NTuple{N,Any};
-                              groupsize::ROCDim=1, gridsize::ROCDim=1) where N
+@generated function _roccall(queue::RuntimeQueue, f::ROCFunction, tt::Type,
+                             args::NTuple{N,Any};
+                             groupsize::ROCDim=1, gridsize::ROCDim=1) where N
 
     # the type of `tt` is Type{Tuple{<:DataType...}}
     types = tt.parameters[1].parameters
@@ -189,7 +192,7 @@ end
 
     append!(ex.args, (quote
         GC.@preserve $(converted_args...) begin
-            launch(queue, signal, f, groupsize, gridsize, ($(arg_ptrs...),))
+            launch(queue, f, groupsize, gridsize, ($(arg_ptrs...),))
         end
     end).args)
 
diff --git a/src/opencl.jl b/src/opencl.jl
new file mode 100644
index 0000000..2f85920
--- /dev/null
+++ b/src/opencl.jl
@@ -0,0 +1,101 @@
+# OpenCL runtime interface to AMDGPUnative
+using OpenCL
+#const cl = OpenCL.cl
+
+include("opencl/args.jl")
+include("opencl/buffer.jl")
+include("opencl/deviceptr.jl")
+
+const OCL_DEFAULT_DEVICE = Ref{cl.Device}()
+const OCL_DEFAULT_CONTEXT = IdDict{cl.Device,cl.Context}()
+const OCL_DEFAULT_CMDQUEUE = IdDict{cl.Device,cl.CmdQueue}()
+devs = cl.devices()
+if length(devs) > 0
+    dev = first(devs)
+    OCL_DEFAULT_DEVICE[] = dev
+    ctx = cl.Context(dev)
+    OCL_DEFAULT_CONTEXT[dev] = ctx
+    queue = cl.CmdQueue(ctx)
+    OCL_DEFAULT_CMDQUEUE[dev] = queue
+end
+
+default_device(::typeof(HSA)) = OCL_DEFAULT_DEVICE[]
+
+function default_queue(::typeof(HSA), device)
+    cldev = device.device
+    if !haskey(OCL_DEFAULT_CMDQUEUE, cldev)
+        if !haskey(OCL_DEFAULT_CONTEXT, cldev)
+            ctx = cl.Context(cldev)
+            OCL_DEFAULT_CONTEXT[cldev] = ctx
+        else
+            ctx = OCL_DEFAULT_CONTEXT[cldev]
+        end
+        queue = cl.CmdQueue(ctx)
+        OCL_DEFAULT_CMDQUEUE[cldev] = queue
+        return queue
+    else
+        return OCL_DEFAULT_CMDQUEUE[cldev]
+    end
+end
+get_device(queue::RuntimeQueue{cl.CmdQueue}) =
+    RuntimeDevice(queue.queue[:device])
+get_context(queue::cl.CmdQueue) = queue[:context]
+
+default_isa(device::RuntimeDevice{cl.Device}) =
+    device.device[:name]
+
+#create_event(device::RuntimeDevice{cl.Device}) =
+#    RuntimeEvent(cl.UserEvent(device.device))
+Base.wait(event::RuntimeEvent{cl.Event}) = cl.wait(event.event)
+
+function create_executable(::typeof(OCL), device, func)
+    data = link_kernel(func)
+    binaries = Dict{cl.Device, Array{UInt8}}()
+    binaries[device.device] = data
+    dev = device.device
+    ctx = OCL_DEFAULT_CONTEXT[dev]
+    return cl.Program(ctx, binaries=binaries) |> cl.build!
+end
+
+create_kernel(::typeof(HSA), device, exe, entry, args) =
+    cl.Kernel(exe.exe, entry)
+struct FakeDeviceArray
+    size::Int64
+    ptr::Int64
+end
+clconvert(q, x) = x
+function clconvert(q, x::ROCDeviceArray{T,N,A}) where {T,N,A}
+    #=
+    cl_mem cm_buffer;
+    cm_buffer = clCreateBuffer(cxContext, CL_MEM_READ_ONLY, sizeof(st_foo), NULL, NULL);
+    clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cm_buffer);
+    clEnqueueWriteBuffer(cqueue, cm_buffer, CL_TRUE, 0, sizeof(st_foo), &stVar, 0, NULL, NULL);
+    =#
+    @show x
+    #=
+    @show sizeof(x)
+    xref = Ref(x)
+    GC.@preserve xref begin
+        xptr = Base.unsafe_convert(Ptr{UInt32}, Base.pointer_from_objref(xref))
+        struct_size = div(sizeof(x),sizeof(UInt32))
+        @show struct_size
+        arr = unsafe_wrap(Array, xptr, (struct_size,))
+        @show arr
+        ctx = get_context(q)
+        buf = cl.Buffer(UInt32, ctx, (:rw, :copy); hostbuf=arr)
+        @show buf
+    end
+    return buf
+    =#
+    #return FakeDeviceArray(prod(x.shape), Int64(x.ptr))
+    return x
+end
+function launch_kernel(::typeof(HSA), queue, kern;
+                       groupsize=nothing, gridsize=nothing)
+    q = queue.queue
+    k = kern.kernel
+    args = map(arg->clconvert(q, arg), kern.args)
+    cl.set_args!(k, args...)
+    event = cl.enqueue_kernel(q, k, 1, 1) #=gridsize, groupsize,=#
+    return RuntimeEvent(event)
+end
diff --git a/src/opencl/args.jl b/src/opencl/args.jl
new file mode 100644
index 0000000..5b58185
--- /dev/null
+++ b/src/opencl/args.jl
@@ -0,0 +1,12 @@
+# OpenCL argument utilities
+
+# Argument accessors
+
+# __agocl_global_offset_x - OpenCL Global Offset X
+# __agocl_global_offset_y - OpenCL Global Offset Y
+# __agocl_global_offset_z - OpenCL Global Offset Z
+# __agocl_printf_addr - OpenCL address of printf buffer
+# __agocl_queue_addr - OpenCL address of virtual queue used by enqueue_kernel
+# __agocl_aqlwrap_addr - OpenCL address of AqlWrap struct used by enqueue_kernel
+# __agocl_multigrid - Pointer argument used for Multi-grid synchronization
+
diff --git a/src/opencl/buffer.jl b/src/opencl/buffer.jl
new file mode 100644
index 0000000..cfdcfe9
--- /dev/null
+++ b/src/opencl/buffer.jl
@@ -0,0 +1,9 @@
+## interop with cl.Buffer
+function Base.convert(::Type{ROCDeviceArray{T,N,AS.Global}}, a::cl.Buffer{T}) where {T,N}
+    dev_ptr = get_deviceptr(default_queue(default_device()).queue, a)
+    @show dev_ptr
+    ROCDeviceArray{T,N,AS.Global}((length(a),), DevicePtr{T,AS.Global}(dev_ptr))
+end
+
+Adapt.adapt_storage(::Adaptor, xs::cl.Buffer{T}) where T =
+    convert(ROCDeviceArray{T,ndims(xs),AS.Global}, xs)
diff --git a/src/opencl/deviceptr.jl b/src/opencl/deviceptr.jl
new file mode 100644
index 0000000..94e27a9
--- /dev/null
+++ b/src/opencl/deviceptr.jl
@@ -0,0 +1,27 @@
+const deviceptr_source = """
+__kernel void getPtr(__global ulong *ptr, __global ulong *out) {
+    out[0] = &ptr[0];
+}
+"""
+
+const DEVICEPTR_KERNELS = Dict{cl.Device,cl.Kernel}()
+
+function get_deviceptr(q::cl.CmdQueue, buf::cl.Buffer{T}) where T
+    ctx = q[:context]
+    dev = q[:device]
+
+    host_buf = cl.Buffer(UInt64, ctx, :rw, 1)
+    dev_buf = cl.Buffer(UInt64, ctx, :rw, 1)
+
+    if !haskey(DEVICEPTR_KERNELS, dev)
+        prog = cl.build!(cl.Program(ctx, source=deviceptr_source), options="-Werror")
+        kern = cl.Kernel(prog, "getPtr")
+        DEVICEPTR_KERNELS[dev] = kern
+    else
+        kern = DEVICEPTR_KERNELS[dev]
+    end
+
+    event = q(kern, 1, nothing, host_buf, dev_buf)
+    cl.wait(event)
+    return Ptr{T}(cl.read(q, dev_buf)[1])
+end
diff --git a/src/reflection.jl b/src/reflection.jl
index 5baeef5..8b8daa3 100644
--- a/src/reflection.jl
+++ b/src/reflection.jl
@@ -7,7 +7,7 @@ using InteractiveUtils
 #   and/or to support generating otherwise invalid code (e.g. with missing symbols).
 
 """
-    code_llvm([io], f, types; optimize=true, agent::HSAAgent=get_default_agent(), kernel=false,
+    code_llvm([io], f, types; optimize=true, device::RuntimeDevice=default_device(), kernel=false,
               optimize=true, raw=false, dump_module=false, strict=false)
 
 Prints the device LLVM IR generated for the method matching the given generic function and
@@ -15,7 +15,7 @@ type signature to `io` which defaults to `stdout`.
 
 The following keyword arguments are supported:
 
-- `agent`: which device to generate code for
+- `device`: which device to generate code for
 - `kernel`: treat the function as an entry-point kernel
 - `optimize`: determines if the code is optimized, which includes kernel-specific
   optimizations if `kernel` is true
@@ -26,11 +26,11 @@ The following keyword arguments are supported:
 See also: [`@device_code_llvm`](@ref), [`InteractiveUtils.code_llvm`](@ref)
 """
 function code_llvm(io::IO, @nospecialize(func), @nospecialize(types);
-                   optimize::Bool=true, agent::HSAAgent=get_default_agent(),
+                   optimize::Bool=true, device::RuntimeDevice=default_device(),
                    dump_module::Bool=false, raw::Bool=false,
                    kernel::Bool=false, strict::Bool=false, kwargs...)
     tt = Base.to_tuple_type(types)
-    job = CompilerJob(func, tt, agent, kernel; kwargs...)
+    job = CompilerJob(func, tt, device, kernel; kwargs...)
     code_llvm(io, job; optimize=optimize,
               raw=raw, dump_module=dump_module, strict=strict)
 end
@@ -49,14 +49,14 @@ code_llvm(@nospecialize(func), @nospecialize(types); kwargs...) =
     code_llvm(stdout, func, types; kwargs...)
 
 """
-    code_gcn([io], f, types; agent::HSAAgent=get_default_agent(), kernel=false, raw=false, strict=false)
+    code_gcn([io], f, types; device::RuntimeDevice=default_device(), kernel=false, raw=false, strict=false)
 
 Prints the GCN assembly generated for the method matching the given generic function and
 type signature to `io` which defaults to `stdout`.
 
 The following keyword arguments are supported:
 
-- `agent`: which device to generate code for
+- `device`: which device to generate code for
 - `kernel`: treat the function as an entry-point kernel
 - `raw`: return the raw code including all metadata
 - `strict`: verify generate code as early as possible
@@ -64,10 +64,10 @@ The following keyword arguments are supported:
 See also: [`@device_code_gcn`](@ref)
 """
 function code_gcn(io::IO, @nospecialize(func), @nospecialize(types);
-                  agent::HSAAgent=get_default_agent(), kernel::Bool=false,
+                  device::RuntimeDevice=default_device(), kernel::Bool=false,
                   raw::Bool=false, strict::Bool=false, kwargs...)
     tt = Base.to_tuple_type(types)
-    job = CompilerJob(func, tt, agent, kernel; kwargs...)
+    job = CompilerJob(func, tt, device, kernel; kwargs...)
     code_gcn(io, job; raw=raw, strict=strict)
 end
 
diff --git a/src/runtime.jl b/src/runtime.jl
new file mode 100644
index 0000000..c38e295
--- /dev/null
+++ b/src/runtime.jl
@@ -0,0 +1,51 @@
+struct RuntimeDevice{D}
+    device::D
+end
+default_device() = RuntimeDevice(default_device(RUNTIME[]))
+default_device(::typeof(HSA)) = HSARuntime.get_default_agent()
+
+struct RuntimeQueue{Q}
+    queue::Q
+end
+default_queue(device) = RuntimeQueue(default_queue(RUNTIME[], device))
+default_queue(::typeof(HSA), device) =
+    HSARuntime.get_default_queue(device.device)
+get_device(queue::RuntimeQueue{HSAQueue}) = RuntimeDevice(queue.queue.agent)
+
+default_isa(device::RuntimeDevice{HSAAgent}) =
+    HSARuntime.get_first_isa(device.device)
+
+struct RuntimeEvent{E}
+    event::E
+end
+#create_event(device::RuntimeDevice{HSAAgent}) = RuntimeEvent(HSASignal())
+Base.wait(event::RuntimeEvent) = wait(event.event)
+
+struct RuntimeExecutable{E}
+    exe::E
+end
+create_executable(device, func) =
+    RuntimeExecutable(create_executable(RUNTIME[], device, func))
+function create_executable(::typeof(HSA), device, func)
+    data = link_kernel(func)
+
+    return HSAExecutable(device.device, data, func.entry)
+end
+
+struct RuntimeKernel{K,A}
+    kernel::K
+    args::A
+end
+create_kernel(device, exe, entry, args) =
+    RuntimeKernel(create_kernel(RUNTIME[], device, exe, entry, args), args)
+create_kernel(::typeof(HSA), device, exe, entry, args) =
+    HSAKernelInstance(device.device, exe.exe, entry, args)
+launch_kernel(queue, kern; kwargs...) =
+    launch_kernel(RUNTIME[], queue, kern; kwargs...)
+function launch_kernel(::typeof(HSA), queue, kern;
+                       groupsize=nothing, gridsize=nothing)
+    signal = HSASignal()
+    HSARuntime.launch!(queue.queue, kern.kernel, signal;
+                       workgroup_size=groupsize, grid_size=gridsize)
+    return RuntimeEvent(signal)
+end
diff --git a/test/opencl.jl b/test/opencl.jl
new file mode 100644
index 0000000..90a759b
--- /dev/null
+++ b/test/opencl.jl
@@ -0,0 +1,38 @@
+using OpenCL
+AMDGPUnative.RUNTIME[] = AMDGPUnative.OCL
+using LinearAlgebra
+
+if cl.api.libopencl != "" && length(cl.devices()) > 0
+@testset "OpenCL runtime integration" begin
+
+@test typeof(AMDGPUnative.default_device().device) <: cl.Device
+
+function mysum(X,Y,Z)
+    idx = AMDGPUnative.workitemIdx_x()
+    Z[idx] = X[idx] + Y[idx]
+    nothing
+end
+
+dev = AMDGPUnative.default_device()
+q = AMDGPUnative.default_queue(dev).queue
+dev = dev.device
+ctx = AMDGPUnative.get_context(q)
+
+A = rand(Float32, 8)
+B = rand(Float32, 8)
+
+A_ = cl.Buffer(Float32, ctx, (:rw, :copy), hostbuf=A)
+B_ = cl.Buffer(Float32, ctx, (:rw, :copy), hostbuf=B)
+C_ = cl.Buffer(Float32, ctx, :rw, length(A))
+
+@test Base.unsafe_convert(Ptr{Nothing}, A_.id) != AMDGPUnative.get_deviceptr(q, A_)
+
+@roc groupsize=1 gridsize=1 mysum(A_,B_,C_)
+
+cl.finish(q)
+C = cl.read(q, C_)
+@test all(x->x>0, C)
+@test isapprox(norm(C - (A+B)), zero(Float32))
+
+end # @testset
+end # if
diff --git a/test/runtests.jl b/test/runtests.jl
index 7aba725..1ff4af0 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -82,4 +82,6 @@ else
     @warn("AMDGPUnative.jl has not been configured; skipping on-device tests.")
 end
 
+include("opencl.jl")
+
 end