diff --git a/Project.toml b/Project.toml index 2234b0b..b5c0bfd 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ HSARuntime = "2c364e2c-59fb-59c3-96f3-194112e690e0" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [compat] @@ -22,8 +23,10 @@ TimerOutputs = "0.5" julia = "1" [extras] +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +OpenCL = "08131aa3-fb12-5dee-8b74-c09406e224a2" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["SpecialFunctions", "Test"] +test = ["LinearAlgebra", "OpenCL", "SpecialFunctions", "Test"] diff --git a/src/AMDGPUnative.jl b/src/AMDGPUnative.jl index 4c518b0..83047ed 100644 --- a/src/AMDGPUnative.jl +++ b/src/AMDGPUnative.jl @@ -7,6 +7,14 @@ using Adapt using TimerOutputs using DataStructures using Libdl +using Requires + +@enum DeviceRuntime HSA OCL +const RUNTIME = Ref{DeviceRuntime}(HSA) +if get(ENV, "AMDGPUNATIVE_OPENCL", "") != "" + RUNTIME[] = OCL +end +include("runtime.jl") const configured = HSARuntime.configured @@ -21,6 +29,7 @@ include(joinpath("device", "pointer.jl")) include(joinpath("device", "array.jl")) include(joinpath("device", "gcn.jl")) include(joinpath("device", "runtime.jl")) +include(joinpath("device", "llvm.jl")) include("execution_utils.jl") include("compiler.jl") @@ -29,6 +38,7 @@ include("reflection.jl") function __init__() check_deps() + @require OpenCL="08131aa3-fb12-5dee-8b74-c09406e224a2" include("opencl.jl") __init_compiler__() end diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 00b5630..86c5067 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -4,7 +4,7 @@ struct CompilerJob # core invocation f::Base.Callable tt::DataType - agent::HSAAgent + device::RuntimeDevice kernel::Bool # optional properties @@ -14,10 +14,10 @@ struct CompilerJob maxregs::Union{Nothing,Integer} name::Union{Nothing,String} - CompilerJob(f, tt, agent, kernel; name=nothing, + CompilerJob(f, tt, device, kernel; name=nothing, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, agent, kernel, minthreads, maxthreads, blocks_per_sm, + new(f, tt, device, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name) end diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 52d7ab4..ceeb10d 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,11 +4,11 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(target::Symbol, agent::HSAAgent, f, tt, kernel=true; + compile(target::Symbol, device::RuntimeDevice, f, tt, kernel=true; libraries=true, optimize=true, strip=false, strict=true, ...) -Compile a function `f` invoked with types `tt` for agent `agent` to one of the -following formats as specified by the `target` argument: `:julia` for Julia -IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked +Compile a function `f` invoked with types `tt` for device `device` to one of +the following formats as specified by the `target` argument: `:julia` for +Julia IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked objects. If the `kernel` flag is set, specialized code generation and optimization for kernel functions is enabled. The following keyword arguments are supported: @@ -18,11 +18,11 @@ The following keyword arguments are supported: - `strict`: perform code validation either as early or as late as possible Other keyword arguments can be found in the documentation of [`rocfunction`](@ref). """ -compile(target::Symbol, agent::HSAAgent, @nospecialize(f::Core.Function), +compile(target::Symbol, device::RuntimeDevice, @nospecialize(f::Core.Function), @nospecialize(tt), kernel::Bool=true; libraries::Bool=true, optimize::Bool=true, strip::Bool=false, strict::Bool=true, kwargs...) = - compile(target, CompilerJob(f, tt, agent, kernel; kwargs...); + compile(target, CompilerJob(f, tt, device, kernel; kwargs...); libraries=libraries, optimize=optimize, strip=strip, strict=strict) @@ -88,7 +88,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, # always preload the runtime, and do so early; it cannot be part of any timing block # because it recurses into the compiler if libraries - runtime = load_runtime(job.agent) + runtime = load_runtime(job.device) runtime_fns = LLVM.name.(defs(runtime)) end @@ -105,7 +105,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, end =# # FIXME: Load this only when needed - device_libs = load_device_libs(job.agent) + device_libs = load_device_libs(job.device) for lib in device_libs if need_library(ir, lib) @timeit to[] "device library" link_device_lib!(job, ir, lib) diff --git a/src/compiler/mcgen.jl b/src/compiler/mcgen.jl index f7285a4..428c3c5 100644 --- a/src/compiler/mcgen.jl +++ b/src/compiler/mcgen.jl @@ -1,12 +1,12 @@ # machine code generation -function machine(agent::HSAAgent, triple::String) +function machine(device::RuntimeDevice, triple::String) InitializeAMDGPUTarget() InitializeAMDGPUTargetInfo() t = Target(triple) InitializeAMDGPUTargetMC() - cpu = get_first_isa(agent) # TODO: Make this configurable + cpu = default_isa(device) # TODO: Make this configurable feat = "" tm = TargetMachine(t, triple, cpu, feat) asm_verbosity!(tm, true) @@ -80,7 +80,7 @@ end function mcgen(job::CompilerJob, mod::LLVM.Module, f::LLVM.Function; output_format=LLVM.API.LLVMObjectFile) - tm = machine(job.agent, triple(mod)) + tm = machine(job.device, triple(mod)) InitializeAMDGPUAsmPrinter() return String(emit(tm, mod, output_format)) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index a777842..b04f959 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1,7 +1,7 @@ # LLVM IR optimization function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) - tm = AMDGPUnative.machine(job.agent, triple(mod)) + tm = AMDGPUnative.machine(job.device, triple(mod)) if job.kernel entry = promote_kernel!(job, mod, entry) diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 0661523..ec0b5d9 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -30,10 +30,10 @@ const libcache = Dict{String, LLVM.Module}() # ROCm device library -function load_device_libs(agent) +function load_device_libs(device) device_libs_path === nothing && return - isa_short = replace(get_first_isa(agent), "gfx"=>"") + isa_short = replace(default_isa(device), "gfx"=>"") device_libs = LLVM.Module[] bitcode_files = ( "hc.amdgcn.bc", @@ -132,20 +132,20 @@ end ## functionality to build the runtime library -function emit_function!(mod, agent, f, types, name) +function emit_function!(mod, device, f, types, name) tt = Base.to_tuple_type(types) - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, agent, #=kernel=# false); + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, device, #=kernel=# false); libraries=false, strict=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(agent) +function build_runtime(device) mod = LLVM.Module("AMDGPUnative run-time library", JuliaContext()) for method in values(Runtime.methods) try - emit_function!(mod, agent, method.def, method.types, method.llvm_name) + emit_function!(mod, device, method.def, method.types, method.llvm_name) catch err @warn method end @@ -154,8 +154,8 @@ function build_runtime(agent) mod end -function load_runtime(agent::HSAAgent) - isa = get_first_isa(agent) +function load_runtime(device::RuntimeDevice) + isa = default_isa(device) name = "amdgpunative.$isa.bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -167,7 +167,7 @@ function load_runtime(agent::HSAAgent) end else @info "Building the AMDGPUnative run-time library for your $isa device, this might take a while..." - lib = build_runtime(agent) + lib = build_runtime(device) open(path, "w") do io write(io, lib) end diff --git a/src/execution.jl b/src/execution.jl index 6fa9e9c..d0e79b7 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -3,7 +3,7 @@ export @roc, rocconvert, rocfunction struct Kernel{F,TT} - agent::HSAAgent + device::RuntimeDevice mod::ROCModule fun::ROCFunction end @@ -11,8 +11,9 @@ end # `split_kwargs()` segregates keyword arguments passed to `@roc` into those # affecting the compiler, kernel execution, or both. function split_kwargs(kwargs) - compiler_kws = [:agent, :queue, :name] - call_kws = [:groupsize, :gridsize, :agent, :queue] + # TODO: Alias groupsize and gridsize as threads and blocks, respectively + compiler_kws = [:device, :agent, :queue, :name] + call_kws = [:groupsize, :gridsize, :device, :agent, :queue] compiler_kwargs = [] call_kwargs = [] for kwarg in kwargs @@ -60,6 +61,23 @@ function assign_args!(code, args) return vars, var_exprs end +function extract_device(;device=nothing, agent=nothing, kwargs...) + if device !== nothing + return device + elseif agent !== nothing + return agent + else + return default_device() + end +end +function extract_queue(device; queue=nothing, kwargs...) + if queue !== nothing + return queue + else + return default_queue(device) + end +end + # fast lookup of global world age world_age() = ccall(:jl_get_tls_world_age, UInt, ()) @@ -125,13 +143,12 @@ macro roc(ex...) GC.@preserve $(vars...) begin local kernel_args = map(rocconvert, ($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local agent = get_default_agent() - local kernel = rocfunction(agent, $(esc(f)), kernel_tt; + local device = extract_device(; $(map(esc, call_kwargs)...)) + local kernel = rocfunction(device, $(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) - local queue = get_default_queue(agent) - local signal = HSASignal() - kernel(queue, signal, kernel_args...; $(map(esc, call_kwargs)...)) - wait(signal) + local queue = extract_queue(device; $(map(esc, call_kwargs)...)) + local event = kernel(queue, kernel_args...; $(map(esc, call_kwargs)...)) + wait(event) end end) return code @@ -188,7 +205,7 @@ The output of this function is automatically cached, i.e. you can simply call generated automatically, when the function changes, or when different types or keyword arguments are provided. """ -@generated function rocfunction(agent::HSAAgent, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...) +@generated function rocfunction(device::RuntimeDevice, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...) tt = Base.to_tuple_type(tt.parameters[1]) sig = Base.signature_type(f, tt) t = Tuple(tt.parameters) @@ -217,8 +234,8 @@ keyword arguments are provided. # compile the function if !haskey(compilecache, key) - fun, mod = compile(:roc, agent, f, tt; name=name, kwargs...) - kernel = Kernel{f,tt}(agent, mod, fun) + fun, mod = compile(:roc, device, f, tt; name=name, kwargs...) + kernel = Kernel{f,tt}(device, mod, fun) compilecache[key] = kernel end @@ -227,10 +244,10 @@ keyword arguments are provided. end rocfunction(f::Core.Function, tt::Type=Tuple{}; kwargs...) = - rocfunction(get_default_agent(), f, tt; kwargs...) + rocfunction(default_device(), f, tt; kwargs...) -@generated function call(kernel::Kernel{F,TT}, queue::HSAQueue, - signal::HSASignal, args...; call_kwargs...) where {F,TT} +@generated function call(kernel::Kernel{F,TT}, queue::RuntimeQueue, + args...; call_kwargs...) where {F,TT} sig = Base.signature_type(F, TT) args = (:F, (:( args[$i] ) for i in 1:length(args))...) @@ -254,7 +271,7 @@ rocfunction(f::Core.Function, tt::Type=Tuple{}; kwargs...) = quote Base.@_inline_meta - roccall(queue, signal, kernel.fun, $call_tt, $(call_args...); call_kwargs...) + roccall(queue, kernel.fun, $call_tt, $(call_args...); call_kwargs...) end end diff --git a/src/execution_utils.jl b/src/execution_utils.jl index 1207971..b469687 100644 --- a/src/execution_utils.jl +++ b/src/execution_utils.jl @@ -50,20 +50,32 @@ function Base.getindex(dims::ROCDim3, idx::Int) error("Invalid dimension: $idx") end +function link_kernel(func::ROCFunction) + # link with ld.lld + ld_path = HSARuntime.ld_lld_path + @assert ld_path != "" "ld.lld was not found; cannot link kernel" + mktemp() do objpath, objio + write(objio, func.mod.data) + flush(objio) + run(`$ld_path -shared -o $(objpath*".exe") $objpath`) + data = read(objpath*".exe") + end +end + """ - launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction, + launch(queue::RuntimeQueue, f::ROCFunction, groupsize::ROCDim, gridsize::ROCDim, args...) Low-level call to launch a ROC function `f` on the GPU, using `groupsize` and `gridsize` as the grid and block configuration, respectively. The kernel is -launched on `queue` and is waited on by `signal`. +launched on `queue` and returns a `RuntimeEvent`. Arguments to a kernel should either be bitstype, in which case they will be copied to the internal kernel parameter buffer, or a pointer to device memory. This is a low-level call, preferably use [`roccall`](@ref) instead. """ -@inline function launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction, +@inline function launch(queue::RuntimeQueue, f::ROCFunction, groupsize::ROCDim, gridsize::ROCDim, args...) groupsize = ROCDim3(groupsize) gridsize = ROCDim3(gridsize) @@ -72,12 +84,12 @@ This is a low-level call, preferably use [`roccall`](@ref) instead. (gridsize.x>0 && gridsize.y>0 && gridsize.z>0) || throw(ArgumentError("Grid dimensions should be non-null")) - _launch(queue, signal, f, groupsize, gridsize, args...) + _launch(queue, f, groupsize, gridsize, args...) end # we need a generated function to get an args array, # without having to inspect the types at runtime -@generated function _launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction, +@generated function _launch(queue::RuntimeQueue, f::ROCFunction, groupsize::ROCDim3, gridsize::ROCDim3, args::NTuple{N,Any}) where N @@ -101,23 +113,12 @@ end GC.@preserve $(arg_refs...) begin kernelParams = [$(arg_ptrs...)] - # link with ld.lld - ld_path = HSARuntime.ld_lld_path - @assert ld_path != "" "ld.lld was not found; cannot link kernel" - # TODO: Do this more idiomatically - io = open("/tmp/amdgpu-dump.o", "w") - write(io, f.mod.data) - close(io) - run(`$ld_path -shared -o /tmp/amdgpu.exe /tmp/amdgpu-dump.o`) - io = open("/tmp/amdgpu.exe", "r") - data = read(io) - close(io) - - # generate executable and kernel instance - exe = HSAExecutable(queue.agent, data, f.entry) - kern = HSAKernelInstance(queue.agent, exe, f.entry, args) - HSARuntime.launch!(queue, kern, signal; - workgroup_size=groupsize, grid_size=gridsize) + # create executable and kernel instance + exe = create_executable(get_device(queue), f) + kern = create_kernel(get_device(queue), exe, f.entry, args) + + # launch kernel + launch_kernel(queue, kern; groupsize=groupsize, gridsize=gridsize) end end).args) @@ -125,7 +126,7 @@ end end """ - roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types, values...; + roccall(queue::RuntimeQueue, f::ROCFunction, types, values...; groupsize::ROCDim, gridsize::ROCDim) `ccall`-like interface for launching a ROC function `f` on a GPU. @@ -151,24 +152,26 @@ being slightly faster. """ roccall -@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types::NTuple{N,DataType}, values::Vararg{Any,N}; - kwargs...) where N +@inline function roccall(queue::RuntimeQueue, f::ROCFunction, + types::NTuple{N,DataType}, values::Vararg{Any,N}; + kwargs...) where N # this cannot be inferred properly (because types only contains `DataType`s), # which results in the call `@generated _roccall` getting expanded upon first use - _roccall(queue, signal, f, Tuple{types...}, values; kwargs...) + _roccall(queue, f, Tuple{types...}, values; kwargs...) end -@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, values::Vararg{Any,N}; - kwargs...) where N +@inline function roccall(queue::RuntimeQueue, f::ROCFunction, tt::Type, + values::Vararg{Any,N}; kwargs...) where N # in this case, the type of `tt` is `Tuple{<:DataType,...}`, # which means the generated function can be expanded earlier - _roccall(queue, signal, f, tt, values; kwargs...) + _roccall(queue, f, tt, values; kwargs...) end # we need a generated function to get a tuple of converted arguments (using unsafe_convert), # without having to inspect the types at runtime -@generated function _roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, args::NTuple{N,Any}; - groupsize::ROCDim=1, gridsize::ROCDim=1) where N +@generated function _roccall(queue::RuntimeQueue, f::ROCFunction, tt::Type, + args::NTuple{N,Any}; + groupsize::ROCDim=1, gridsize::ROCDim=1) where N # the type of `tt` is Type{Tuple{<:DataType...}} types = tt.parameters[1].parameters @@ -189,7 +192,7 @@ end append!(ex.args, (quote GC.@preserve $(converted_args...) begin - launch(queue, signal, f, groupsize, gridsize, ($(arg_ptrs...),)) + launch(queue, f, groupsize, gridsize, ($(arg_ptrs...),)) end end).args) diff --git a/src/opencl.jl b/src/opencl.jl new file mode 100644 index 0000000..2f85920 --- /dev/null +++ b/src/opencl.jl @@ -0,0 +1,101 @@ +# OpenCL runtime interface to AMDGPUnative +using OpenCL +#const cl = OpenCL.cl + +include("opencl/args.jl") +include("opencl/buffer.jl") +include("opencl/deviceptr.jl") + +const OCL_DEFAULT_DEVICE = Ref{cl.Device}() +const OCL_DEFAULT_CONTEXT = IdDict{cl.Device,cl.Context}() +const OCL_DEFAULT_CMDQUEUE = IdDict{cl.Device,cl.CmdQueue}() +devs = cl.devices() +if length(devs) > 0 + dev = first(devs) + OCL_DEFAULT_DEVICE[] = dev + ctx = cl.Context(dev) + OCL_DEFAULT_CONTEXT[dev] = ctx + queue = cl.CmdQueue(ctx) + OCL_DEFAULT_CMDQUEUE[dev] = queue +end + +default_device(::typeof(HSA)) = OCL_DEFAULT_DEVICE[] + +function default_queue(::typeof(HSA), device) + cldev = device.device + if !haskey(OCL_DEFAULT_CMDQUEUE, cldev) + if !haskey(OCL_DEFAULT_CONTEXT, cldev) + ctx = cl.Context(cldev) + OCL_DEFAULT_CONTEXT[cldev] = ctx + else + ctx = OCL_DEFAULT_CONTEXT[cldev] + end + queue = cl.CmdQueue(ctx) + OCL_DEFAULT_CMDQUEUE[cldev] = queue + return queue + else + return OCL_DEFAULT_CMDQUEUE[cldev] + end +end +get_device(queue::RuntimeQueue{cl.CmdQueue}) = + RuntimeDevice(queue.queue[:device]) +get_context(queue::cl.CmdQueue) = queue[:context] + +default_isa(device::RuntimeDevice{cl.Device}) = + device.device[:name] + +#create_event(device::RuntimeDevice{cl.Device}) = +# RuntimeEvent(cl.UserEvent(device.device)) +Base.wait(event::RuntimeEvent{cl.Event}) = cl.wait(event.event) + +function create_executable(::typeof(OCL), device, func) + data = link_kernel(func) + binaries = Dict{cl.Device, Array{UInt8}}() + binaries[device.device] = data + dev = device.device + ctx = OCL_DEFAULT_CONTEXT[dev] + return cl.Program(ctx, binaries=binaries) |> cl.build! +end + +create_kernel(::typeof(HSA), device, exe, entry, args) = + cl.Kernel(exe.exe, entry) +struct FakeDeviceArray + size::Int64 + ptr::Int64 +end +clconvert(q, x) = x +function clconvert(q, x::ROCDeviceArray{T,N,A}) where {T,N,A} + #= + cl_mem cm_buffer; + cm_buffer = clCreateBuffer(cxContext, CL_MEM_READ_ONLY, sizeof(st_foo), NULL, NULL); + clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void*)&cm_buffer); + clEnqueueWriteBuffer(cqueue, cm_buffer, CL_TRUE, 0, sizeof(st_foo), &stVar, 0, NULL, NULL); + =# + @show x + #= + @show sizeof(x) + xref = Ref(x) + GC.@preserve xref begin + xptr = Base.unsafe_convert(Ptr{UInt32}, Base.pointer_from_objref(xref)) + struct_size = div(sizeof(x),sizeof(UInt32)) + @show struct_size + arr = unsafe_wrap(Array, xptr, (struct_size,)) + @show arr + ctx = get_context(q) + buf = cl.Buffer(UInt32, ctx, (:rw, :copy); hostbuf=arr) + @show buf + end + return buf + =# + #return FakeDeviceArray(prod(x.shape), Int64(x.ptr)) + return x +end +function launch_kernel(::typeof(HSA), queue, kern; + groupsize=nothing, gridsize=nothing) + q = queue.queue + k = kern.kernel + args = map(arg->clconvert(q, arg), kern.args) + cl.set_args!(k, args...) + event = cl.enqueue_kernel(q, k, 1, 1) #=gridsize, groupsize,=# + return RuntimeEvent(event) +end diff --git a/src/opencl/args.jl b/src/opencl/args.jl new file mode 100644 index 0000000..5b58185 --- /dev/null +++ b/src/opencl/args.jl @@ -0,0 +1,12 @@ +# OpenCL argument utilities + +# Argument accessors + +# __agocl_global_offset_x - OpenCL Global Offset X +# __agocl_global_offset_y - OpenCL Global Offset Y +# __agocl_global_offset_z - OpenCL Global Offset Z +# __agocl_printf_addr - OpenCL address of printf buffer +# __agocl_queue_addr - OpenCL address of virtual queue used by enqueue_kernel +# __agocl_aqlwrap_addr - OpenCL address of AqlWrap struct used by enqueue_kernel +# __agocl_multigrid - Pointer argument used for Multi-grid synchronization + diff --git a/src/opencl/buffer.jl b/src/opencl/buffer.jl new file mode 100644 index 0000000..cfdcfe9 --- /dev/null +++ b/src/opencl/buffer.jl @@ -0,0 +1,9 @@ +## interop with cl.Buffer +function Base.convert(::Type{ROCDeviceArray{T,N,AS.Global}}, a::cl.Buffer{T}) where {T,N} + dev_ptr = get_deviceptr(default_queue(default_device()).queue, a) + @show dev_ptr + ROCDeviceArray{T,N,AS.Global}((length(a),), DevicePtr{T,AS.Global}(dev_ptr)) +end + +Adapt.adapt_storage(::Adaptor, xs::cl.Buffer{T}) where T = + convert(ROCDeviceArray{T,ndims(xs),AS.Global}, xs) diff --git a/src/opencl/deviceptr.jl b/src/opencl/deviceptr.jl new file mode 100644 index 0000000..94e27a9 --- /dev/null +++ b/src/opencl/deviceptr.jl @@ -0,0 +1,27 @@ +const deviceptr_source = """ +__kernel void getPtr(__global ulong *ptr, __global ulong *out) { + out[0] = &ptr[0]; +} +""" + +const DEVICEPTR_KERNELS = Dict{cl.Device,cl.Kernel}() + +function get_deviceptr(q::cl.CmdQueue, buf::cl.Buffer{T}) where T + ctx = q[:context] + dev = q[:device] + + host_buf = cl.Buffer(UInt64, ctx, :rw, 1) + dev_buf = cl.Buffer(UInt64, ctx, :rw, 1) + + if !haskey(DEVICEPTR_KERNELS, dev) + prog = cl.build!(cl.Program(ctx, source=deviceptr_source), options="-Werror") + kern = cl.Kernel(prog, "getPtr") + DEVICEPTR_KERNELS[dev] = kern + else + kern = DEVICEPTR_KERNELS[dev] + end + + event = q(kern, 1, nothing, host_buf, dev_buf) + cl.wait(event) + return Ptr{T}(cl.read(q, dev_buf)[1]) +end diff --git a/src/reflection.jl b/src/reflection.jl index 5baeef5..8b8daa3 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -7,7 +7,7 @@ using InteractiveUtils # and/or to support generating otherwise invalid code (e.g. with missing symbols). """ - code_llvm([io], f, types; optimize=true, agent::HSAAgent=get_default_agent(), kernel=false, + code_llvm([io], f, types; optimize=true, device::RuntimeDevice=default_device(), kernel=false, optimize=true, raw=false, dump_module=false, strict=false) Prints the device LLVM IR generated for the method matching the given generic function and @@ -15,7 +15,7 @@ type signature to `io` which defaults to `stdout`. The following keyword arguments are supported: -- `agent`: which device to generate code for +- `device`: which device to generate code for - `kernel`: treat the function as an entry-point kernel - `optimize`: determines if the code is optimized, which includes kernel-specific optimizations if `kernel` is true @@ -26,11 +26,11 @@ The following keyword arguments are supported: See also: [`@device_code_llvm`](@ref), [`InteractiveUtils.code_llvm`](@ref) """ function code_llvm(io::IO, @nospecialize(func), @nospecialize(types); - optimize::Bool=true, agent::HSAAgent=get_default_agent(), + optimize::Bool=true, device::RuntimeDevice=default_device(), dump_module::Bool=false, raw::Bool=false, kernel::Bool=false, strict::Bool=false, kwargs...) tt = Base.to_tuple_type(types) - job = CompilerJob(func, tt, agent, kernel; kwargs...) + job = CompilerJob(func, tt, device, kernel; kwargs...) code_llvm(io, job; optimize=optimize, raw=raw, dump_module=dump_module, strict=strict) end @@ -49,14 +49,14 @@ code_llvm(@nospecialize(func), @nospecialize(types); kwargs...) = code_llvm(stdout, func, types; kwargs...) """ - code_gcn([io], f, types; agent::HSAAgent=get_default_agent(), kernel=false, raw=false, strict=false) + code_gcn([io], f, types; device::RuntimeDevice=default_device(), kernel=false, raw=false, strict=false) Prints the GCN assembly generated for the method matching the given generic function and type signature to `io` which defaults to `stdout`. The following keyword arguments are supported: -- `agent`: which device to generate code for +- `device`: which device to generate code for - `kernel`: treat the function as an entry-point kernel - `raw`: return the raw code including all metadata - `strict`: verify generate code as early as possible @@ -64,10 +64,10 @@ The following keyword arguments are supported: See also: [`@device_code_gcn`](@ref) """ function code_gcn(io::IO, @nospecialize(func), @nospecialize(types); - agent::HSAAgent=get_default_agent(), kernel::Bool=false, + device::RuntimeDevice=default_device(), kernel::Bool=false, raw::Bool=false, strict::Bool=false, kwargs...) tt = Base.to_tuple_type(types) - job = CompilerJob(func, tt, agent, kernel; kwargs...) + job = CompilerJob(func, tt, device, kernel; kwargs...) code_gcn(io, job; raw=raw, strict=strict) end diff --git a/src/runtime.jl b/src/runtime.jl new file mode 100644 index 0000000..c38e295 --- /dev/null +++ b/src/runtime.jl @@ -0,0 +1,51 @@ +struct RuntimeDevice{D} + device::D +end +default_device() = RuntimeDevice(default_device(RUNTIME[])) +default_device(::typeof(HSA)) = HSARuntime.get_default_agent() + +struct RuntimeQueue{Q} + queue::Q +end +default_queue(device) = RuntimeQueue(default_queue(RUNTIME[], device)) +default_queue(::typeof(HSA), device) = + HSARuntime.get_default_queue(device.device) +get_device(queue::RuntimeQueue{HSAQueue}) = RuntimeDevice(queue.queue.agent) + +default_isa(device::RuntimeDevice{HSAAgent}) = + HSARuntime.get_first_isa(device.device) + +struct RuntimeEvent{E} + event::E +end +#create_event(device::RuntimeDevice{HSAAgent}) = RuntimeEvent(HSASignal()) +Base.wait(event::RuntimeEvent) = wait(event.event) + +struct RuntimeExecutable{E} + exe::E +end +create_executable(device, func) = + RuntimeExecutable(create_executable(RUNTIME[], device, func)) +function create_executable(::typeof(HSA), device, func) + data = link_kernel(func) + + return HSAExecutable(device.device, data, func.entry) +end + +struct RuntimeKernel{K,A} + kernel::K + args::A +end +create_kernel(device, exe, entry, args) = + RuntimeKernel(create_kernel(RUNTIME[], device, exe, entry, args), args) +create_kernel(::typeof(HSA), device, exe, entry, args) = + HSAKernelInstance(device.device, exe.exe, entry, args) +launch_kernel(queue, kern; kwargs...) = + launch_kernel(RUNTIME[], queue, kern; kwargs...) +function launch_kernel(::typeof(HSA), queue, kern; + groupsize=nothing, gridsize=nothing) + signal = HSASignal() + HSARuntime.launch!(queue.queue, kern.kernel, signal; + workgroup_size=groupsize, grid_size=gridsize) + return RuntimeEvent(signal) +end diff --git a/test/opencl.jl b/test/opencl.jl new file mode 100644 index 0000000..90a759b --- /dev/null +++ b/test/opencl.jl @@ -0,0 +1,38 @@ +using OpenCL +AMDGPUnative.RUNTIME[] = AMDGPUnative.OCL +using LinearAlgebra + +if cl.api.libopencl != "" && length(cl.devices()) > 0 +@testset "OpenCL runtime integration" begin + +@test typeof(AMDGPUnative.default_device().device) <: cl.Device + +function mysum(X,Y,Z) + idx = AMDGPUnative.workitemIdx_x() + Z[idx] = X[idx] + Y[idx] + nothing +end + +dev = AMDGPUnative.default_device() +q = AMDGPUnative.default_queue(dev).queue +dev = dev.device +ctx = AMDGPUnative.get_context(q) + +A = rand(Float32, 8) +B = rand(Float32, 8) + +A_ = cl.Buffer(Float32, ctx, (:rw, :copy), hostbuf=A) +B_ = cl.Buffer(Float32, ctx, (:rw, :copy), hostbuf=B) +C_ = cl.Buffer(Float32, ctx, :rw, length(A)) + +@test Base.unsafe_convert(Ptr{Nothing}, A_.id) != AMDGPUnative.get_deviceptr(q, A_) + +@roc groupsize=1 gridsize=1 mysum(A_,B_,C_) + +cl.finish(q) +C = cl.read(q, C_) +@test all(x->x>0, C) +@test isapprox(norm(C - (A+B)), zero(Float32)) + +end # @testset +end # if diff --git a/test/runtests.jl b/test/runtests.jl index 7aba725..1ff4af0 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -82,4 +82,6 @@ else @warn("AMDGPUnative.jl has not been configured; skipping on-device tests.") end +include("opencl.jl") + end