diff --git a/Project.toml b/Project.toml index 2234b0b..624ae70 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ HSARuntime = "2c364e2c-59fb-59c3-96f3-194112e690e0" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" +Requires = "ae029012-a4dd-5104-9daa-d747884805df" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" [compat] diff --git a/src/AMDGPUnative.jl b/src/AMDGPUnative.jl index 4c518b0..a12c8b8 100644 --- a/src/AMDGPUnative.jl +++ b/src/AMDGPUnative.jl @@ -7,6 +7,16 @@ using Adapt using TimerOutputs using DataStructures using Libdl +using Requires + +@enum DeviceRuntime HSA OCL +const RUNTIME = Ref{DeviceRuntime}(HSA) +#= +if get(ENV, "AMDGPUNATIVE_OPENCL", "") != "" + RUNTIME[] = OCL +end +=# +include("runtime.jl") const configured = HSARuntime.configured @@ -29,6 +39,7 @@ include("reflection.jl") function __init__() check_deps() + @require OpenCL="08131aa3-fb12-5dee-8b74-c09406e224a2" include("opencl.jl") __init_compiler__() end diff --git a/src/compiler/common.jl b/src/compiler/common.jl index 00b5630..86c5067 100644 --- a/src/compiler/common.jl +++ b/src/compiler/common.jl @@ -4,7 +4,7 @@ struct CompilerJob # core invocation f::Base.Callable tt::DataType - agent::HSAAgent + device::RuntimeDevice kernel::Bool # optional properties @@ -14,10 +14,10 @@ struct CompilerJob maxregs::Union{Nothing,Integer} name::Union{Nothing,String} - CompilerJob(f, tt, agent, kernel; name=nothing, + CompilerJob(f, tt, device, kernel; name=nothing, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing) = - new(f, tt, agent, kernel, minthreads, maxthreads, blocks_per_sm, + new(f, tt, device, kernel, minthreads, maxthreads, blocks_per_sm, maxregs, name) end diff --git a/src/compiler/driver.jl b/src/compiler/driver.jl index 52d7ab4..ceeb10d 100644 --- a/src/compiler/driver.jl +++ b/src/compiler/driver.jl @@ -4,11 +4,11 @@ const compile_hook = Ref{Union{Nothing,Function}}(nothing) """ - compile(target::Symbol, agent::HSAAgent, f, tt, kernel=true; + compile(target::Symbol, device::RuntimeDevice, f, tt, kernel=true; libraries=true, optimize=true, strip=false, strict=true, ...) -Compile a function `f` invoked with types `tt` for agent `agent` to one of the -following formats as specified by the `target` argument: `:julia` for Julia -IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked +Compile a function `f` invoked with types `tt` for device `device` to one of +the following formats as specified by the `target` argument: `:julia` for +Julia IR, `:llvm` for LLVM IR, `:gcn` for GCN assembly, and `:roc` for linked objects. If the `kernel` flag is set, specialized code generation and optimization for kernel functions is enabled. The following keyword arguments are supported: @@ -18,11 +18,11 @@ The following keyword arguments are supported: - `strict`: perform code validation either as early or as late as possible Other keyword arguments can be found in the documentation of [`rocfunction`](@ref). """ -compile(target::Symbol, agent::HSAAgent, @nospecialize(f::Core.Function), +compile(target::Symbol, device::RuntimeDevice, @nospecialize(f::Core.Function), @nospecialize(tt), kernel::Bool=true; libraries::Bool=true, optimize::Bool=true, strip::Bool=false, strict::Bool=true, kwargs...) = - compile(target, CompilerJob(f, tt, agent, kernel; kwargs...); + compile(target, CompilerJob(f, tt, device, kernel; kwargs...); libraries=libraries, optimize=optimize, strip=strip, strict=strict) @@ -88,7 +88,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, # always preload the runtime, and do so early; it cannot be part of any timing block # because it recurses into the compiler if libraries - runtime = load_runtime(job.agent) + runtime = load_runtime(job.device) runtime_fns = LLVM.name.(defs(runtime)) end @@ -105,7 +105,7 @@ function codegen(target::Symbol, job::CompilerJob; libraries::Bool=true, end =# # FIXME: Load this only when needed - device_libs = load_device_libs(job.agent) + device_libs = load_device_libs(job.device) for lib in device_libs if need_library(ir, lib) @timeit to[] "device library" link_device_lib!(job, ir, lib) diff --git a/src/compiler/mcgen.jl b/src/compiler/mcgen.jl index f7285a4..428c3c5 100644 --- a/src/compiler/mcgen.jl +++ b/src/compiler/mcgen.jl @@ -1,12 +1,12 @@ # machine code generation -function machine(agent::HSAAgent, triple::String) +function machine(device::RuntimeDevice, triple::String) InitializeAMDGPUTarget() InitializeAMDGPUTargetInfo() t = Target(triple) InitializeAMDGPUTargetMC() - cpu = get_first_isa(agent) # TODO: Make this configurable + cpu = default_isa(device) # TODO: Make this configurable feat = "" tm = TargetMachine(t, triple, cpu, feat) asm_verbosity!(tm, true) @@ -80,7 +80,7 @@ end function mcgen(job::CompilerJob, mod::LLVM.Module, f::LLVM.Function; output_format=LLVM.API.LLVMObjectFile) - tm = machine(job.agent, triple(mod)) + tm = machine(job.device, triple(mod)) InitializeAMDGPUAsmPrinter() return String(emit(tm, mod, output_format)) diff --git a/src/compiler/optim.jl b/src/compiler/optim.jl index a777842..b04f959 100644 --- a/src/compiler/optim.jl +++ b/src/compiler/optim.jl @@ -1,7 +1,7 @@ # LLVM IR optimization function optimize!(job::CompilerJob, mod::LLVM.Module, entry::LLVM.Function) - tm = AMDGPUnative.machine(job.agent, triple(mod)) + tm = AMDGPUnative.machine(job.device, triple(mod)) if job.kernel entry = promote_kernel!(job, mod, entry) diff --git a/src/compiler/rtlib.jl b/src/compiler/rtlib.jl index 0661523..ec0b5d9 100644 --- a/src/compiler/rtlib.jl +++ b/src/compiler/rtlib.jl @@ -30,10 +30,10 @@ const libcache = Dict{String, LLVM.Module}() # ROCm device library -function load_device_libs(agent) +function load_device_libs(device) device_libs_path === nothing && return - isa_short = replace(get_first_isa(agent), "gfx"=>"") + isa_short = replace(default_isa(device), "gfx"=>"") device_libs = LLVM.Module[] bitcode_files = ( "hc.amdgcn.bc", @@ -132,20 +132,20 @@ end ## functionality to build the runtime library -function emit_function!(mod, agent, f, types, name) +function emit_function!(mod, device, f, types, name) tt = Base.to_tuple_type(types) - new_mod, entry = codegen(:llvm, CompilerJob(f, tt, agent, #=kernel=# false); + new_mod, entry = codegen(:llvm, CompilerJob(f, tt, device, #=kernel=# false); libraries=false, strict=false) LLVM.name!(entry, name) link!(mod, new_mod) end -function build_runtime(agent) +function build_runtime(device) mod = LLVM.Module("AMDGPUnative run-time library", JuliaContext()) for method in values(Runtime.methods) try - emit_function!(mod, agent, method.def, method.types, method.llvm_name) + emit_function!(mod, device, method.def, method.types, method.llvm_name) catch err @warn method end @@ -154,8 +154,8 @@ function build_runtime(agent) mod end -function load_runtime(agent::HSAAgent) - isa = get_first_isa(agent) +function load_runtime(device::RuntimeDevice) + isa = default_isa(device) name = "amdgpunative.$isa.bc" path = joinpath(@__DIR__, "..", "..", "deps", "runtime", name) mkpath(dirname(path)) @@ -167,7 +167,7 @@ function load_runtime(agent::HSAAgent) end else @info "Building the AMDGPUnative run-time library for your $isa device, this might take a while..." - lib = build_runtime(agent) + lib = build_runtime(device) open(path, "w") do io write(io, lib) end diff --git a/src/execution.jl b/src/execution.jl index 6fa9e9c..b1662e4 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -3,7 +3,7 @@ export @roc, rocconvert, rocfunction struct Kernel{F,TT} - agent::HSAAgent + device::RuntimeDevice mod::ROCModule fun::ROCFunction end @@ -11,8 +11,9 @@ end # `split_kwargs()` segregates keyword arguments passed to `@roc` into those # affecting the compiler, kernel execution, or both. function split_kwargs(kwargs) - compiler_kws = [:agent, :queue, :name] - call_kws = [:groupsize, :gridsize, :agent, :queue] + # TODO: Alias groupsize and gridsize as threads and blocks, respectively + compiler_kws = [:device, :agent, :queue, :name] + call_kws = [:groupsize, :gridsize, :device, :agent, :queue] compiler_kwargs = [] call_kwargs = [] for kwarg in kwargs @@ -60,6 +61,23 @@ function assign_args!(code, args) return vars, var_exprs end +function extract_device(;device=nothing, agent=nothing, kwargs...) + if device !== nothing + return device + elseif agent !== nothing + return agent + else + return default_device() + end +end +function extract_queue(device; queue=nothing, kwargs...) + if queue !== nothing + return queue + else + return default_queue(device) + end +end + # fast lookup of global world age world_age() = ccall(:jl_get_tls_world_age, UInt, ()) @@ -125,11 +143,11 @@ macro roc(ex...) GC.@preserve $(vars...) begin local kernel_args = map(rocconvert, ($(var_exprs...),)) local kernel_tt = Tuple{Core.Typeof.(kernel_args)...} - local agent = get_default_agent() - local kernel = rocfunction(agent, $(esc(f)), kernel_tt; + local device = extract_device(; $(map(esc, call_kwargs)...)) + local kernel = rocfunction(device, $(esc(f)), kernel_tt; $(map(esc, compiler_kwargs)...)) - local queue = get_default_queue(agent) - local signal = HSASignal() + local queue = extract_queue(device; $(map(esc, call_kwargs)...)) + local signal = create_event() kernel(queue, signal, kernel_args...; $(map(esc, call_kwargs)...)) wait(signal) end @@ -188,7 +206,7 @@ The output of this function is automatically cached, i.e. you can simply call generated automatically, when the function changes, or when different types or keyword arguments are provided. """ -@generated function rocfunction(agent::HSAAgent, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...) +@generated function rocfunction(device::RuntimeDevice, f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...) tt = Base.to_tuple_type(tt.parameters[1]) sig = Base.signature_type(f, tt) t = Tuple(tt.parameters) @@ -217,8 +235,8 @@ keyword arguments are provided. # compile the function if !haskey(compilecache, key) - fun, mod = compile(:roc, agent, f, tt; name=name, kwargs...) - kernel = Kernel{f,tt}(agent, mod, fun) + fun, mod = compile(:roc, device, f, tt; name=name, kwargs...) + kernel = Kernel{f,tt}(device, mod, fun) compilecache[key] = kernel end @@ -227,10 +245,10 @@ keyword arguments are provided. end rocfunction(f::Core.Function, tt::Type=Tuple{}; kwargs...) = - rocfunction(get_default_agent(), f, tt; kwargs...) + rocfunction(default_device(), f, tt; kwargs...) -@generated function call(kernel::Kernel{F,TT}, queue::HSAQueue, - signal::HSASignal, args...; call_kwargs...) where {F,TT} +@generated function call(kernel::Kernel{F,TT}, queue::RuntimeQueue, + signal::RuntimeEvent, args...; call_kwargs...) where {F,TT} sig = Base.signature_type(F, TT) args = (:F, (:( args[$i] ) for i in 1:length(args))...) diff --git a/src/execution_utils.jl b/src/execution_utils.jl index 1207971..da09381 100644 --- a/src/execution_utils.jl +++ b/src/execution_utils.jl @@ -51,7 +51,7 @@ function Base.getindex(dims::ROCDim3, idx::Int) end """ - launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction, + launch(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, groupsize::ROCDim, gridsize::ROCDim, args...) Low-level call to launch a ROC function `f` on the GPU, using `groupsize` and @@ -63,7 +63,7 @@ copied to the internal kernel parameter buffer, or a pointer to device memory. This is a low-level call, preferably use [`roccall`](@ref) instead. """ -@inline function launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction, +@inline function launch(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, groupsize::ROCDim, gridsize::ROCDim, args...) groupsize = ROCDim3(groupsize) gridsize = ROCDim3(gridsize) @@ -77,7 +77,7 @@ end # we need a generated function to get an args array, # without having to inspect the types at runtime -@generated function _launch(queue::HSAQueue, signal::HSASignal, f::ROCFunction, +@generated function _launch(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, groupsize::ROCDim3, gridsize::ROCDim3, args::NTuple{N,Any}) where N @@ -101,23 +101,13 @@ end GC.@preserve $(arg_refs...) begin kernelParams = [$(arg_ptrs...)] - # link with ld.lld - ld_path = HSARuntime.ld_lld_path - @assert ld_path != "" "ld.lld was not found; cannot link kernel" - # TODO: Do this more idiomatically - io = open("/tmp/amdgpu-dump.o", "w") - write(io, f.mod.data) - close(io) - run(`$ld_path -shared -o /tmp/amdgpu.exe /tmp/amdgpu-dump.o`) - io = open("/tmp/amdgpu.exe", "r") - data = read(io) - close(io) - - # generate executable and kernel instance - exe = HSAExecutable(queue.agent, data, f.entry) - kern = HSAKernelInstance(queue.agent, exe, f.entry, args) - HSARuntime.launch!(queue, kern, signal; - workgroup_size=groupsize, grid_size=gridsize) + # create executable and kernel instance + exe = create_executable(get_device(queue), f) + kern = create_kernel(get_device(queue), exe, f.entry, args) + + # launch kernel + launch_kernel(queue, kern, signal; + groupsize=groupsize, gridsize=gridsize) end end).args) @@ -125,7 +115,7 @@ end end """ - roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types, values...; + roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, types, values...; groupsize::ROCDim, gridsize::ROCDim) `ccall`-like interface for launching a ROC function `f` on a GPU. @@ -151,14 +141,14 @@ being slightly faster. """ roccall -@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, types::NTuple{N,DataType}, values::Vararg{Any,N}; +@inline function roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, types::NTuple{N,DataType}, values::Vararg{Any,N}; kwargs...) where N # this cannot be inferred properly (because types only contains `DataType`s), # which results in the call `@generated _roccall` getting expanded upon first use _roccall(queue, signal, f, Tuple{types...}, values; kwargs...) end -@inline function roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, values::Vararg{Any,N}; +@inline function roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, tt::Type, values::Vararg{Any,N}; kwargs...) where N # in this case, the type of `tt` is `Tuple{<:DataType,...}`, # which means the generated function can be expanded earlier @@ -167,7 +157,7 @@ end # we need a generated function to get a tuple of converted arguments (using unsafe_convert), # without having to inspect the types at runtime -@generated function _roccall(queue::HSAQueue, signal::HSASignal, f::ROCFunction, tt::Type, args::NTuple{N,Any}; +@generated function _roccall(queue::RuntimeQueue, signal::RuntimeEvent, f::ROCFunction, tt::Type, args::NTuple{N,Any}; groupsize::ROCDim=1, gridsize::ROCDim=1) where N # the type of `tt` is Type{Tuple{<:DataType...}} diff --git a/src/opencl.jl b/src/opencl.jl new file mode 100644 index 0000000..a3bfff2 --- /dev/null +++ b/src/opencl.jl @@ -0,0 +1,3 @@ +# OpenCL runtime interface to AMDGPUnative + +include("opencl/args.jl") diff --git a/src/opencl/args.jl b/src/opencl/args.jl new file mode 100644 index 0000000..5b58185 --- /dev/null +++ b/src/opencl/args.jl @@ -0,0 +1,12 @@ +# OpenCL argument utilities + +# Argument accessors + +# __agocl_global_offset_x - OpenCL Global Offset X +# __agocl_global_offset_y - OpenCL Global Offset Y +# __agocl_global_offset_z - OpenCL Global Offset Z +# __agocl_printf_addr - OpenCL address of printf buffer +# __agocl_queue_addr - OpenCL address of virtual queue used by enqueue_kernel +# __agocl_aqlwrap_addr - OpenCL address of AqlWrap struct used by enqueue_kernel +# __agocl_multigrid - Pointer argument used for Multi-grid synchronization + diff --git a/src/reflection.jl b/src/reflection.jl index 5baeef5..8b8daa3 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -7,7 +7,7 @@ using InteractiveUtils # and/or to support generating otherwise invalid code (e.g. with missing symbols). """ - code_llvm([io], f, types; optimize=true, agent::HSAAgent=get_default_agent(), kernel=false, + code_llvm([io], f, types; optimize=true, device::RuntimeDevice=default_device(), kernel=false, optimize=true, raw=false, dump_module=false, strict=false) Prints the device LLVM IR generated for the method matching the given generic function and @@ -15,7 +15,7 @@ type signature to `io` which defaults to `stdout`. The following keyword arguments are supported: -- `agent`: which device to generate code for +- `device`: which device to generate code for - `kernel`: treat the function as an entry-point kernel - `optimize`: determines if the code is optimized, which includes kernel-specific optimizations if `kernel` is true @@ -26,11 +26,11 @@ The following keyword arguments are supported: See also: [`@device_code_llvm`](@ref), [`InteractiveUtils.code_llvm`](@ref) """ function code_llvm(io::IO, @nospecialize(func), @nospecialize(types); - optimize::Bool=true, agent::HSAAgent=get_default_agent(), + optimize::Bool=true, device::RuntimeDevice=default_device(), dump_module::Bool=false, raw::Bool=false, kernel::Bool=false, strict::Bool=false, kwargs...) tt = Base.to_tuple_type(types) - job = CompilerJob(func, tt, agent, kernel; kwargs...) + job = CompilerJob(func, tt, device, kernel; kwargs...) code_llvm(io, job; optimize=optimize, raw=raw, dump_module=dump_module, strict=strict) end @@ -49,14 +49,14 @@ code_llvm(@nospecialize(func), @nospecialize(types); kwargs...) = code_llvm(stdout, func, types; kwargs...) """ - code_gcn([io], f, types; agent::HSAAgent=get_default_agent(), kernel=false, raw=false, strict=false) + code_gcn([io], f, types; device::RuntimeDevice=default_device(), kernel=false, raw=false, strict=false) Prints the GCN assembly generated for the method matching the given generic function and type signature to `io` which defaults to `stdout`. The following keyword arguments are supported: -- `agent`: which device to generate code for +- `device`: which device to generate code for - `kernel`: treat the function as an entry-point kernel - `raw`: return the raw code including all metadata - `strict`: verify generate code as early as possible @@ -64,10 +64,10 @@ The following keyword arguments are supported: See also: [`@device_code_gcn`](@ref) """ function code_gcn(io::IO, @nospecialize(func), @nospecialize(types); - agent::HSAAgent=get_default_agent(), kernel::Bool=false, + device::RuntimeDevice=default_device(), kernel::Bool=false, raw::Bool=false, strict::Bool=false, kwargs...) tt = Base.to_tuple_type(types) - job = CompilerJob(func, tt, agent, kernel; kwargs...) + job = CompilerJob(func, tt, device, kernel; kwargs...) code_gcn(io, job; raw=raw, strict=strict) end diff --git a/src/runtime.jl b/src/runtime.jl new file mode 100644 index 0000000..616f5c3 --- /dev/null +++ b/src/runtime.jl @@ -0,0 +1,58 @@ +struct RuntimeDevice{D} + device::D +end +default_device() = RuntimeDevice(default_device(RUNTIME[])) +default_device(::typeof(HSA)) = HSARuntime.get_default_agent() + +struct RuntimeQueue{Q} + queue::Q +end +default_queue(device) = RuntimeQueue(default_queue(RUNTIME[], device)) +default_queue(::typeof(HSA), device) = + HSARuntime.get_default_queue(device.device) +get_device(queue::RuntimeQueue{HSAQueue}) = RuntimeDevice(queue.queue.agent) + +default_isa(device::RuntimeDevice{HSAAgent}) = + HSARuntime.get_first_isa(device.device) + +struct RuntimeEvent{E} + event::E +end +create_event() = RuntimeEvent(create_event(RUNTIME[])) +create_event(::typeof(HSA)) = HSASignal() +Base.wait(event::RuntimeEvent) = wait(event.event) + +struct RuntimeExecutable{E} + exe::E +end +create_executable(device, func) = + RuntimeExecutable(create_executable(RUNTIME[], device, func)) +function create_executable(::typeof(HSA), device, func) + # link with ld.lld + ld_path = HSARuntime.ld_lld_path + @assert ld_path != "" "ld.lld was not found; cannot link kernel" + # TODO: Do this more idiomatically + io = open("/tmp/amdgpu-dump.o", "w") + write(io, func.mod.data) + close(io) + run(`$ld_path -shared -o /tmp/amdgpu.exe /tmp/amdgpu-dump.o`) + io = open("/tmp/amdgpu.exe", "r") + data = read(io) + close(io) + + return HSAExecutable(device.device, data, func.entry) +end + +struct RuntimeKernel{K} + kernel::K +end +create_kernel(device, exe, entry, args) = + RuntimeKernel(create_kernel(RUNTIME[], device, exe, entry, args)) +create_kernel(::typeof(HSA), device, exe, entry, args) = + HSAKernelInstance(device.device, exe.exe, entry, args) +launch_kernel(queue, kern, event; kwargs...) = + launch_kernel(RUNTIME[], queue, kern, event; kwargs...) +launch_kernel(::typeof(HSA), queue, kern, event; + groupsize=nothing, gridsize=nothing) = + HSARuntime.launch!(queue.queue, kern.kernel, event.event; + workgroup_size=groupsize, grid_size=gridsize)