Skip to content
This repository was archived by the owner on Nov 18, 2020. It is now read-only.

Implement decent kernel exceptions #67

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,11 @@ uuid = "9fa8497b-333b-5362-9e8d-4d0656e87820"

[[GPUCompiler]]
deps = ["DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs", "UUIDs"]
git-tree-sha1 = "3f405c2aab2ef755d022bd57466a35c6d08a1531"
repo-rev = "158cd601fc42faed088785a7bde16436cbaa6017"
git-tree-sha1 = "65f7395a1245635f0c2279649fdbef09a1b0aa7b"
repo-rev = "master"
repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git"
uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
version = "0.3.0"
version = "0.4.0"

[[HSARuntime]]
deps = ["CEnum", "Libdl", "Setfield"]
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Requires = "ae029012-a4dd-5104-9daa-d747884805df"
[compat]
Adapt = "0.4, 1.0"
BinaryProvider = "0.5"
GPUCompiler = "0.3"
GPUCompiler = "0.4"
HSARuntime = "0.3"
LLVM = "1.3"
Requires = "1"
Expand Down
1 change: 1 addition & 0 deletions src/AMDGPUnative.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ include(joinpath("device", "globals.jl"))
include("compiler.jl")
include("execution_utils.jl")
include("execution.jl")
include("exceptions.jl")
include("reflection.jl")

function __init__()
Expand Down
9 changes: 8 additions & 1 deletion src/compiler.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,14 @@ function GPUCompiler.process_module!(job::ROCCompilerJob, mod::LLVM.Module)
invoke(GPUCompiler.process_module!,
Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)},
job, mod)
#emit_exception_flag!(mod)
# Run this early (before optimization) to ensure we link OCKL
emit_exception_user!(mod)
end
# Finish `mod` for a ROC compiler job: first run the `finish_module!` method
# registered for `CompilerJob{GCNCompilerTarget}`, then call
# `delete_exception_user!` on the module and return its result.
function GPUCompiler.finish_module!(job::ROCCompilerJob, mod::LLVM.Module)
    # `invoke` selects the GCN-level method explicitly instead of re-dispatching
    # to this one.
    gcn_signature = Tuple{CompilerJob{GCNCompilerTarget}, typeof(mod)}
    invoke(GPUCompiler.finish_module!, gcn_signature, job, mod)
    return delete_exception_user!(mod)
end

function GPUCompiler.link_libraries!(job::ROCCompilerJob, mod::LLVM.Module,
Expand Down
11 changes: 7 additions & 4 deletions src/device/gcn.jl
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
if Base.libllvm_version >= v"7.0"
include(joinpath("gcn", "math.jl"))
end
# HSA dispatch packet offsets
_packet_names = fieldnames(HSA.KernelDispatchPacket)
_packet_offsets = fieldoffset.(HSA.KernelDispatchPacket, 1:length(_packet_names))

include(joinpath("gcn", "math.jl"))
include(joinpath("gcn", "indexing.jl"))
include(joinpath("gcn", "assertion.jl"))
include(joinpath("gcn", "synchronization.jl"))
include(joinpath("gcn", "memory_static.jl"))
include(joinpath("gcn", "memory_dynamic.jl"))
include(joinpath("gcn", "hostcall.jl"))
include(joinpath("gcn", "output.jl"))
include(joinpath("gcn", "memory_dynamic.jl"))
include(joinpath("gcn", "execution_control.jl"))
41 changes: 41 additions & 0 deletions src/device/gcn/execution_control.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
## completion signal

# Byte offset of the `completion_signal` field, looked up by name in the
# dispatch packet field tables `_packet_names` / `_packet_offsets`.
const completion_signal_base =
    _packet_offsets[findfirst(isequal(:completion_signal), _packet_names)]

# Generated device function returning, as a `UInt64`, the 64-bit value stored
# `completion_signal_base` bytes into the kernel dispatch packet obtained from
# the `llvm.amdgcn.dispatch.ptr` intrinsic.
@generated function _completion_signal()
    ctx = JuliaContext()
    T_i8 = LLVM.Int8Type(ctx)
    T_i64 = LLVM.Int64Type(ctx)
    const_as = convert(Int, AS.Constant)
    T_i8_ptr = LLVM.PointerType(T_i8, const_as)
    T_i64_ptr = LLVM.PointerType(T_i64, const_as)

    # function with no arguments that returns an i64
    llvm_f, _ = create_function(T_i64)
    mod = LLVM.parent(llvm_f)

    Builder(ctx) do builder
        entry = BasicBlock(llvm_f, "entry", ctx)
        position!(builder, entry)

        # pointer to the kernel dispatch packet
        dispatch_ptr_typ = LLVM.FunctionType(T_i8_ptr)
        dispatch_ptr_f = LLVM.Function(mod, "llvm.amdgcn.dispatch.ptr", dispatch_ptr_typ)
        packet = call!(builder, dispatch_ptr_f)

        # address the completion signal field and load its value (not an index)
        field_offset = ConstantInt(completion_signal_base, ctx)
        field_i8 = inbounds_gep!(builder, packet, [field_offset])
        field_i64 = bitcast!(builder, field_i8, T_i64_ptr)
        loaded = load!(builder, field_i64)
        ret!(builder, loaded)
    end

    return call_function(llvm_f, UInt64)
end

# Pass the value returned by `_completion_signal()` together with `value` to
# `device_signal_store!` and return its result.
function signal_completion(value::Int64)
    signal = _completion_signal()
    return device_signal_store!(signal, value)
end

## misc. intrinsics
# Call the `llvm.amdgcn.s.sendmsg` LLVM intrinsic with two `Int32` operands;
# `x2` defaults to zero.
@inline function sendmsg(x1, x2=Int32(0))
    return ccall("llvm.amdgcn.s.sendmsg", llvmcall, Cvoid, (Int32, Int32), x1, x2)
end

# Call the `llvm.amdgcn.s.sendmsghalt` LLVM intrinsic with two `Int32`
# operands; `x2` defaults to zero.
@inline function sendmsghalt(x1, x2=Int32(0))
    return ccall("llvm.amdgcn.s.sendmsghalt", llvmcall, Cvoid, (Int32, Int32), x1, x2)
end

# Emit the `s_endpgm` instruction through `@asmcall`.
@inline function endpgm()
    return @asmcall("s_endpgm", "", true)
end
4 changes: 1 addition & 3 deletions src/device/gcn/indexing.jl
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ end
@generated function _dim(::Val{base}, ::Val{off}, ::Val{range}, ::Type{T}) where {base, off, range, T}
T_int8 = LLVM.Int8Type(JuliaContext())
T_int32 = LLVM.Int32Type(JuliaContext())
_as = Base.libllvm_version < v"7.0" ? 2 : 4
_as = convert(Int, AS.Constant)
T_ptr_i8 = LLVM.PointerType(T_int8, _as)
T_ptr_i32 = LLVM.PointerType(T_int32, _as)
T_ptr_T = LLVM.PointerType(convert(LLVMType, T), _as)
Expand Down Expand Up @@ -91,8 +91,6 @@ for dim in (:x, :y, :z)
cufn = Symbol("blockIdx_$dim")
@eval @inline $cufn() = $fn()
end
_packet_names = fieldnames(HSA.KernelDispatchPacket)
_packet_offsets = fieldoffset.(HSA.KernelDispatchPacket, 1:length(_packet_names))
for (dim,off) in ((:x,1), (:y,2), (:z,3))
# Workitem dimension
fn = Symbol("workgroupDim_$dim")
Expand Down
77 changes: 74 additions & 3 deletions src/device/gcn/memory_dynamic.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,75 @@
export malloc
export malloc, free

# Stub implementation
malloc(::Csize_t) = C_NULL
# Allocate `sz` bytes by issuing the hostcall stored in the
# `__global_malloc_hostcall` global and return the resulting pointer.
# A non-null result is also recorded through `kernel_metadata_insert!`.
# NOTE(review): the `Bool` returned by `kernel_metadata_insert!` is ignored, so
# a failed insert goes unnoticed here — confirm this is intended.
function malloc(sz::Csize_t)
    hc_type = HostCall{UInt64,DevicePtr{UInt8,AS.Global},Tuple{Csize_t}}
    hc_ref = get_global_pointer(Val(:__global_malloc_hostcall), hc_type)
    hc = Base.unsafe_load(hc_ref)
    ptr = hostcall!(hc, sz)
    # skip the metadata entry when the hostcall produced a null pointer
    UInt64(ptr) == 0 || kernel_metadata_insert!(ptr, sz)
    return ptr
end

# Release `ptr` by issuing the hostcall stored in the `__global_free_hostcall`
# global (the pointer is passed as a `DevicePtr{UInt8,AS.Global}`), then call
# `kernel_metadata_delete!(ptr)` and return its result.
function free(ptr::DevicePtr{T,AS.Global}) where T
    hc_type = HostCall{UInt64,Nothing,Tuple{DevicePtr{UInt8,AS.Global}}}
    hc_ref = get_global_pointer(Val(:__global_free_hostcall), hc_type)
    hc = Base.unsafe_load(hc_ref)
    byte_ptr = Base.unsafe_convert(DevicePtr{UInt8,AS.Global}, ptr)
    hostcall!(hc, byte_ptr)
    return kernel_metadata_delete!(ptr)
end

# metadata store
# Exception type for a failed insertion into the kernel metadata store.
# `kern` is the kernel identifier included in the error message.
struct MetadataInsertException <: Exception
    kern::UInt64
end

# Print the exception as a fixed message followed by the kernel identifier.
function Base.showerror(io::IO, mae::MetadataInsertException)
    print(io,
          "MetadataInsertException: Failed to insert metadata for kernel ",
          mae.kern)
    return nothing
end
# Record `(ptr, sz)` for the current kernel in the `__global_metadata_store`
# global. Slots are scanned from index 1:
#   * `kern == 0` marks an empty slot, which is filled with
#     `KernelMetadata(_completion_signal(), ptr, sz)`; returns `true`.
#   * `kern == 1` marks the tail slot; nothing is stored and `false` is returned.
#   * any other value is a slot in use and is skipped.
function kernel_metadata_insert!(ptr, sz)
    store = get_global_pointer(Val(:__global_metadata_store), KernelMetadata)
    slot = 1
    while true
        # FIXME: atomic_load
        kern = Base.unsafe_load(store, slot).kern
        if kern == 0
            # FIXME: atomic_store!
            entry = KernelMetadata(_completion_signal(), ptr, sz)
            Base.unsafe_store!(store, entry, slot)
            return true
        end
        # FIXME: throw(MetadataInsertException(_completion_signal()))
        kern == 1 && return false
        slot += 1
    end
end
# Clear a metadata slot belonging to the current kernel in the
# `__global_metadata_store` global. Slots are scanned from index 1:
#   * `kern == _completion_signal()` — the slot is zeroed with `memset!`;
#     returns `true`.
#   * `kern == 1` marks the tail slot; returns `false`.
#   * any other slot is skipped.
# NOTE(review): `ptr` is never read, so the first slot matching this kernel's
# signal is cleared regardless of which allocation is being freed — confirm
# this is intended when a kernel holds several allocations.
function kernel_metadata_delete!(ptr)
    store = get_global_pointer(Val(:__global_metadata_store), KernelMetadata)
    our_signal = _completion_signal()
    slot = 1
    while true
        # FIXME: atomic_load
        kern = Base.unsafe_load(store, slot).kern
        if kern == our_signal
            # FIXME: atomic_store!
            # byte address of the matching slot (slots are 1-indexed)
            byte_offset = sizeof(KernelMetadata)*(slot-1)
            slot_ptr = convert(DevicePtr{UInt8,AS.Global},
                               Base.unsafe_convert(Ptr{KernelMetadata}, store) +
                               byte_offset)
            memset!(slot_ptr, 0x0, Csize_t(sizeof(KernelMetadata)))
            return true
        end
        # FIXME: throw(MetadataDeleteException(_completion_signal()))
        kern == 1 && return false
        slot += 1
    end
end
71 changes: 71 additions & 0 deletions src/device/gcn/memory_static.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,3 +41,74 @@ export alloc_special

call_function(llvm_f, DevicePtr{T,as})
end

# Generated device function that embeds `String(str)` as a global string in the
# emitted LLVM function and returns its address as a
# `DevicePtr{UInt8,AS.Generic}`.
#
# `str` is a type-domain value (callers pass `Val(Symbol(...))`), so every
# distinct string gets its own specialization.
@inline @generated function alloc_string(::Val{str}) where str
    ctx = JuliaContext()
    T_int64 = LLVM.Int64Type(ctx)

    # the pointer is returned as an i64 and reinterpreted by `call_function`
    llvm_f, _ = create_function(T_int64)
    Builder(ctx) do builder
        entry = BasicBlock(llvm_f, "entry", ctx)
        position!(builder, entry)
        str_ptr = globalstring_ptr!(builder, String(str))
        str_ptr_i64 = ptrtoint!(builder, str_ptr, T_int64)
        ret!(builder, str_ptr_i64)
    end
    return call_function(llvm_f, DevicePtr{UInt8,AS.Generic})
end

# TODO: Support various types of len
# Copy `len` bytes from `src_ptr` to `dest_ptr` by emitting a call to the
# `llvm.memcpy` intrinsic specialized on the address spaces of both pointers.
@inline @generated function memcpy!(dest_ptr::DevicePtr{UInt8,DestAS},
                                    src_ptr::DevicePtr{UInt8,SrcAS},
                                    len::LT) where {DestAS,SrcAS,LT<:Union{Int64,UInt64}}
    ctx = JuliaContext()
    dest_as = convert(Int, DestAS)
    src_as = convert(Int, SrcAS)

    T_void = LLVM.VoidType(ctx)
    T_i1 = LLVM.Int1Type(ctx)
    T_i8 = LLVM.Int8Type(ctx)
    T_i64 = LLVM.Int64Type(ctx)
    T_i8_dest = LLVM.PointerType(T_i8, dest_as)
    T_i64_dest = LLVM.PointerType(T_i64, dest_as)
    T_i8_src = LLVM.PointerType(T_i8, src_as)
    T_i64_src = LLVM.PointerType(T_i64, src_as)

    # all three arguments arrive as i64 values
    llvm_f, _ = create_function(T_void, [T_i64, T_i64, T_i64])
    mod = LLVM.parent(llvm_f)
    intr_typ = LLVM.FunctionType(T_void, [T_i8_dest, T_i8_src, T_i64, T_i1])
    intr = LLVM.Function(mod, "llvm.memcpy.p$(dest_as)i8.p$(src_as)i8.i64", intr_typ)

    Builder(ctx) do builder
        entry = BasicBlock(llvm_f, "entry", ctx)
        position!(builder, entry)
        args = parameters(llvm_f)

        # integer -> i64 pointer -> i8 pointer, in the destination address space
        dest_i64 = inttoptr!(builder, args[1], T_i64_dest)
        dest_i8 = bitcast!(builder, dest_i64, T_i8_dest)

        # same conversion in the source address space
        src_i64 = inttoptr!(builder, args[2], T_i64_src)
        src_i8 = bitcast!(builder, src_i64, T_i8_src)

        # the trailing i1 operand of the intrinsic is passed as 0
        call!(builder, intr, [dest_i8, src_i8, args[3], ConstantInt(T_i1, 0)])
        ret!(builder)
    end

    arg_types = Tuple{DevicePtr{UInt8,DestAS},DevicePtr{UInt8,SrcAS},LT}
    return call_function(llvm_f, Nothing, arg_types, :((dest_ptr, src_ptr, len)))
end
# Fill `len` bytes starting at `dest_ptr` with `value` by emitting a call to
# the `llvm.memset` intrinsic specialized on the pointer's address space.
@inline @generated function memset!(dest_ptr::DevicePtr{UInt8,DestAS},
                                    value::UInt8,
                                    len::LT) where {DestAS,LT<:Union{Int64,UInt64}}
    ctx = JuliaContext()
    dest_as = convert(Int, DestAS)

    T_void = LLVM.VoidType(ctx)
    T_i1 = LLVM.Int1Type(ctx)
    T_i8 = LLVM.Int8Type(ctx)
    T_i64 = LLVM.Int64Type(ctx)
    T_i8_dest = LLVM.PointerType(T_i8, dest_as)
    T_i64_dest = LLVM.PointerType(T_i64, dest_as)

    # pointer and length arrive as i64 values, the fill byte as an i8
    llvm_f, _ = create_function(T_void, [T_i64, T_i8, T_i64])
    mod = LLVM.parent(llvm_f)
    intr_typ = LLVM.FunctionType(T_void, [T_i8_dest, T_i8, T_i64, T_i1])
    intr = LLVM.Function(mod, "llvm.memset.p$(dest_as)i8.i64", intr_typ)

    Builder(ctx) do builder
        entry = BasicBlock(llvm_f, "entry", ctx)
        position!(builder, entry)
        args = parameters(llvm_f)

        # integer -> i64 pointer -> i8 pointer, in the destination address space
        dest_i64 = inttoptr!(builder, args[1], T_i64_dest)
        dest_i8 = bitcast!(builder, dest_i64, T_i8_dest)

        # the trailing i1 operand of the intrinsic is passed as 0
        call!(builder, intr, [dest_i8, args[2], args[3], ConstantInt(T_i1, 0)])
        ret!(builder)
    end

    arg_types = Tuple{DevicePtr{UInt8,DestAS},UInt8,LT}
    return call_function(llvm_f, Nothing, arg_types, :((dest_ptr, value, len)))
end
65 changes: 29 additions & 36 deletions src/device/gcn/output.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,8 @@ function OutputContext(io::IO=stdout; agent=get_default_agent(), buf_len=2^16, k
OutputContext(hc)
end

# Concrete `OutputContext` type that the one-argument `@rocprint` and
# `@rocprintln` macros pass to `get_global_pointer` when loading the
# `__global_output_context` global.
const GLOBAL_OUTPUT_CONTEXT_TYPE = OutputContext{HostCall{UInt64,Int64,Tuple{DeviceStaticString{2^16}}}}

### macros

macro rocprint(oc, str)
Expand All @@ -33,9 +35,30 @@ macro rocprintln(oc, str)
rocprint(oc, str, true)
end

# One-argument `@rocprint`: load the output context from the
# `__global_output_context` global into a fresh local, then append the
# expression built by `rocprint` for `str`.
macro rocprint(str)
    @gensym oc_ptr oc
    load_context = quote
        $(esc(oc_ptr)) = AMDGPUnative.get_global_pointer(Val(:__global_output_context),
                                                         $GLOBAL_OUTPUT_CONTEXT_TYPE)
        $(esc(oc)) = Base.unsafe_load($(esc(oc_ptr)))
    end
    return Expr(:block, load_context.args..., rocprint(oc, str))
end
# One-argument `@rocprintln`: load the output context from the
# `__global_output_context` global into a fresh local, then append the
# expression built by `rocprint` for `str` with its third argument set to `true`.
macro rocprintln(str)
    @gensym oc_ptr oc
    load_context = quote
        $(esc(oc_ptr)) = AMDGPUnative.get_global_pointer(Val(:__global_output_context),
                                                         $GLOBAL_OUTPUT_CONTEXT_TYPE)
        $(esc(oc)) = Base.unsafe_load($(esc(oc_ptr)))
    end
    return Expr(:block, load_context.args..., rocprint(oc, str, true))
end

### parse-time helpers

function rocprint(oc, str, nl=false)
function rocprint(oc, str, nl::Bool=false)
ex = Expr(:block)
if !(str isa Expr)
str = Expr(:string, str)
Expand All @@ -50,18 +73,14 @@ function rocprint(oc, str, nl=false)
dstr = DeviceStaticString{N}()
push!(ex.args, :(hostcall!($(esc(oc)).hostcall, $dstr)))
end
push!(ex.args, :(nothing))
return ex
end
# Append to `ex` the expressions that copy the bytes of `str` into the hostcall
# buffer of `oc`, starting at 1-based buffer position `N`.
#
# Two expressions are pushed: one binding a fresh local to
# `AMDGPUnative.alloc_string(Val(Symbol(str)))`, and one calling
# `AMDGPUnative.memcpy!` from that pointer into `oc.hostcall.buf_ptr + (N-1)`.
#
# Returns the buffer position following the last copied byte.
function rocprint!(ex, N, oc, str::String)
    # Sizes are in bytes (code units), not characters: `length(str)` would
    # under-count any non-ASCII string, truncating the copy and misplacing
    # everything written after it.
    nbytes = ncodeunits(str)
    @gensym str_ptr
    push!(ex.args, :($str_ptr = AMDGPUnative.alloc_string($(Val(Symbol(str))))))
    push!(ex.args, :(AMDGPUnative.memcpy!($(esc(oc)).hostcall.buf_ptr+$(N-1), $str_ptr, $nbytes)))
    return N + nbytes
end
function rocprint!(ex, N, oc, char::Char)
@assert char == '\0' "Non-null chars not yet implemented"
Expand All @@ -84,29 +103,3 @@ end
=#

### runtime helpers

#= TODO: LLVM hates me, but this should eventually work
# FIXME: Pass N and offset oc.buf_ptr appropriately
@inline @generated function rocprint!(oc::OutputContext, ::Val{str}) where str
T_int1 = LLVM.Int1Type(JuliaContext())
T_int32 = LLVM.Int32Type(JuliaContext())
T_pint8 = LLVM.PointerType(LLVM.Int8Type(JuliaContext()))
T_pint8_global = LLVM.PointerType(LLVM.Int8Type(JuliaContext()), convert(Int, AS.Global))
T_nothing = LLVM.VoidType(JuliaContext())
llvm_f, _ = create_function(T_nothing, [T_pint8_global])
mod = LLVM.parent(llvm_f)
T_intr = LLVM.FunctionType(T_nothing, [T_pint8_global, T_pint8, T_int32, T_int32, T_int1])
intr = LLVM.Function(mod, "llvm.memcpy.p1i8.p0i8.i32", T_intr)
Builder(JuliaContext()) do builder
entry = BasicBlock(llvm_f, "entry", JuliaContext())
position!(builder, entry)
str_ptr = globalstring_ptr!(builder, String(str))
buf_ptr = parameters(llvm_f)[1]
# NOTE: There's a hidden alignment parameter (argument 4) that's not documented in the LangRef
call!(builder, intr, [buf_ptr, str_ptr, ConstantInt(Int32(length(string(str))), JuliaContext()), ConstantInt(Int32(2), JuliaContext()), ConstantInt(T_int1, 0)])
ret!(builder)
end
Core.println(unsafe_string(LLVM.API.LLVMPrintValueToString(LLVM.ref(llvm_f))))
call_function(llvm_f, Nothing, Tuple{DevicePtr{UInt8,AS.Global}}, :((oc.hostcall.buf_ptr,)))
end
=#
Loading