From cdadcbfae885bc5b2d8303e9d166cc6f4116adba Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sun, 19 Apr 2020 23:31:02 +0200 Subject: [PATCH 01/34] Add matmul API --- Manifest.toml | 31 ++++ Project.toml | 2 + src/CUDAnative.jl | 2 + src/device/matmul_kernels.jl | 40 +++++ src/device/matmul_kernels/config.jl | 165 +++++++++++++++++++ src/device/matmul_kernels/epilogue.jl | 35 ++++ src/device/matmul_kernels/kernel.jl | 130 +++++++++++++++ src/device/matmul_kernels/layout.jl | 50 ++++++ src/device/matmul_kernels/operator.jl | 66 ++++++++ src/device/matmul_kernels/transform.jl | 28 ++++ src/device/pointer.jl | 27 +++ src/device/tiling.jl | 217 +++++++++++++++++++++++++ test/device/matmul_kernels.jl | 64 ++++++++ test/device/tiling.jl | 71 ++++++++ test/runtests.jl | 2 + 15 files changed, 930 insertions(+) create mode 100644 src/device/matmul_kernels.jl create mode 100644 src/device/matmul_kernels/config.jl create mode 100644 src/device/matmul_kernels/epilogue.jl create mode 100644 src/device/matmul_kernels/kernel.jl create mode 100644 src/device/matmul_kernels/layout.jl create mode 100644 src/device/matmul_kernels/operator.jl create mode 100644 src/device/matmul_kernels/transform.jl create mode 100644 src/device/tiling.jl create mode 100644 test/device/matmul_kernels.jl create mode 100644 test/device/tiling.jl diff --git a/Manifest.toml b/Manifest.toml index 5227a866..dc61c3bd 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -32,6 +32,11 @@ git-tree-sha1 = "e650cbaee92b60433313157926b1e80d0c3a0e2e" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" version = "6.2.2" +[[Cassette]] +git-tree-sha1 = "f6a148cadd38ba328bd2c03442037ef801a6aa05" +uuid = "7057c7e9-c182-5462-911a-8362d720325c" +version = "0.3.1" + [[CodeTracking]] deps = ["InteractiveUtils", "UUIDs"] git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3" @@ -71,6 +76,12 @@ repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" version = "0.1.0" +[[GPUifyLoops]] +deps = ["Cassette", "Requires", "StaticArrays"] +git-tree-sha1 = "671b3b85510f3833c05f3846b3019edc131ab03d" +uuid = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" +version = "0.2.9" + [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" @@ -121,6 +132,12 @@ uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" deps = ["Serialization"] uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" +[[Requires]] +deps = ["UUIDs"] +git-tree-sha1 = "d37400976e98018ee840e0ca4f9d20baa231dc6b" +uuid = "ae029012-a4dd-5104-9daa-d747884805df" +version = "1.0.1" + [[SHA]] uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -130,6 +147,20 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" +[[SparseArrays]] +deps = ["LinearAlgebra", "Random"] +uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + +[[StaticArrays]] +deps = ["LinearAlgebra", "Random", "Statistics"] +git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c" +uuid = "90137ffa-7385-5640-81b9-e52037218182" +version = "0.12.1" + +[[Statistics]] +deps = ["LinearAlgebra", "SparseArrays"] +uuid = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" + [[Test]] deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" diff --git a/Project.toml b/Project.toml index eb639e9f..2f28a1e4 100644 --- a/Project.toml +++ b/Project.toml @@ -10,10 +10,12 @@ CUDAapi = "3895d2a7-ec45-59b8-82bb-cfc6a382f9b3" CUDAdrv = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" ExprTools = "e2ba6199-217a-4e67-a87a-7c52f15ade04" 
GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +GPUifyLoops = "ba82f77b-6841-5d2e-bd9f-4daf811aec27" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] Adapt = "0.4, 1.0" diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 3d69871e..af223219 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -67,6 +67,8 @@ include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") include("device/llvm.jl") +include("device/tiling.jl") +include("device/matmul_kernels.jl") using GPUCompiler include("device/runtime.jl") diff --git a/src/device/matmul_kernels.jl b/src/device/matmul_kernels.jl new file mode 100644 index 00000000..a73c89a5 --- /dev/null +++ b/src/device/matmul_kernels.jl @@ -0,0 +1,40 @@ +export MatMul +module MatMul + +using CUDAdrv +using CUDAnative + +include("matmul_kernels/layout.jl") +include("matmul_kernels/operator.jl") +include("matmul_kernels/transform.jl") +include("matmul_kernels/config.jl") +include("matmul_kernels/epilogue.jl") +include("matmul_kernels/kernel.jl") + +function matmul(a, b, c, d, conf; + transform_global_to_shared_a = Transform.Elementwise(), + transform_global_to_shared_b = Transform.Elementwise(), + transform_global_to_shared_c = Transform.Elementwise(), + transform_shared_to_global_d = Transform.Elementwise(), + transform_shared_to_regs_a = Transform.Elementwise(), + transform_shared_to_regs_b = Transform.Elementwise(), + transform_shared_to_regs_c = Transform.Elementwise(), + transform_regs_to_shared_d = Transform.Elementwise(), + epilogue = Epilogue.Default()) + + args = [a, b, c, d, + transform_global_to_shared_a, transform_global_to_shared_b, transform_global_to_shared_c, transform_shared_to_global_d, + transform_shared_to_regs_a, transform_shared_to_regs_b, transform_shared_to_regs_c, transform_regs_to_shared_d, + epilogue, + conf] + + GC.@preserve args begin + kernel_args = cudaconvert.(args) + kernel_tt = Tuple{Core.Typeof.(kernel_args)...} + kernel = cufunction(Kernel.matmul_impl, kernel_tt; ) + CUDAdrv.attributes(kernel.fun)[CUDAdrv.FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES] = 64 * 1024 + kernel(kernel_args...; conf.launch_args...) 
+ end +end + +end diff --git a/src/device/matmul_kernels/config.jl b/src/device/matmul_kernels/config.jl new file mode 100644 index 00000000..d1c5500f --- /dev/null +++ b/src/device/matmul_kernels/config.jl @@ -0,0 +1,165 @@ +struct Config{ + #= Params =# + MATMUL_SHAPE, # MNK, overall shape of the MATMUL operation + BLOCK_SHAPE, # MNK, shape of each CTA tile + WARPS_PER_BLOCK, # scalar, number of warps per CTA + + MEM_A_WARP, # MK, shape of each warp tile during memory operations involving matrix A + MEM_A_THREAD, # MK, shape of each thread tile during memory operations involving matrix A + + MEM_B_WARP, # KN, shape of each warp tile during memory operations involving matrix B + MEM_B_THREAD, # KN, shape of each thread tile during memory operations involving matrix B + + MEM_CD_WARP, # MN, shape of each warp tile during memory operations involving matrix C or D + MEM_CD_THREAD, # MN, shape of each thread tile during memory operations involving matrix C or D + + COMPUTE_WARP, # MNK, shape of each warp tile during the inner loop computations + COMPUTE_OP_SHAPE, # MNK, shape of the operation used in the inner loop + + #= Layouts =# + GLOBAL_A_LAYOUT, # layout of the A matrix in global memory + GLOBAL_B_LAYOUT, # layout of the B matrix in global memory + GLOBAL_C_LAYOUT, # layout of the C matrix in global memory + GLOBAL_D_LAYOUT, # layout of the D matrix in global memory + + SHARED_A_LAYOUT, # layout of the A matrix in shared memory + SHARED_B_LAYOUT, # layout of the B matrix in shared memory + SHARED_C_LAYOUT, # layout of the C matrix in shared memory + SHARED_D_LAYOUT, # layout of the D matrix in shared memory + + #= Operator =# + OPERATOR, # which operator to use in the inner loop + } +end + +@inline function Base.getproperty(conf::Type{Config{MATMUL_SHAPE, BLOCK_SHAPE, WARPS_PER_BLOCK, MEM_A_WARP, MEM_A_THREAD, MEM_B_WARP, MEM_B_THREAD, MEM_CD_WARP, MEM_CD_THREAD, COMPUTE_WARP, COMPUTE_OP_SHAPE, GLOBAL_A_LAYOUT, GLOBAL_B_LAYOUT, GLOBAL_C_LAYOUT, GLOBAL_D_LAYOUT, SHARED_A_LAYOUT, SHARED_B_LAYOUT, SHARED_C_LAYOUT, SHARED_D_LAYOUT, OPERATOR}}, sym::Symbol) where {MATMUL_SHAPE, BLOCK_SHAPE, WARPS_PER_BLOCK, MEM_A_WARP, MEM_A_THREAD, MEM_B_WARP, MEM_B_THREAD, MEM_CD_WARP, MEM_CD_THREAD, COMPUTE_WARP, COMPUTE_OP_SHAPE, GLOBAL_A_LAYOUT, GLOBAL_B_LAYOUT, GLOBAL_C_LAYOUT, GLOBAL_D_LAYOUT, SHARED_A_LAYOUT, SHARED_B_LAYOUT, SHARED_C_LAYOUT, SHARED_D_LAYOUT, OPERATOR} + if sym == :launch_args + return (threads = WARPS_PER_BLOCK * 32, blocks = (MATMUL_SHAPE.M ÷ BLOCK_SHAPE.M, MATMUL_SHAPE.N ÷ BLOCK_SHAPE.N), shmem = 64 * 1024) + else + return getfield(conf, sym) + end +end + +function heuristic_block_shape(shared_a_layout, shared_b_layout, shared_c_layout, shared_d_layout) + # Determining the tile size of each block is a little trickier. + # We apply the following heuristics: + # 1) Ideally, the block shape in the M and N dimensions should be square or + # nearly-square to maximise data reuse. More specifically, tiling sizes + # of the form (M = k, N = k) and (M = 2k, N = k) work well. + # 2) The tile size should be as large as possible. + # 3) The size in the M and N dimension is limited by the fact that a tile of + # the C (and D) matrix of that size must fit in shared memory. + # 4) The size in the K dimension is limited by the fact that both a M x K tile + # of A and a K x N tile of B must fit in shared memory, at the same time. 
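+    #
+    # As a worked example (a sketch, assuming Float16 A/B with the default padded shared
+    # layouts, Float32 C/D, and the 64 KiB shared memory budget used below): the M/N loop
+    # stops at (M = 128, N = 128), since a 128 x 128 Float32 tile of C/D occupies exactly
+    # 64 KiB, and the K loop stops at K = 64, since doubling K to 128 would make the padded
+    # A and B tiles together exceed 64 KiB.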
+ + num_bytes_A(M, N, K) = prod(Layout.size(shared_a_layout, (M = M, K = K))) * sizeof(Layout.eltype(shared_a_layout)) + num_bytes_B(M, N, K) = prod(Layout.size(shared_b_layout, (K = K, N = N))) * sizeof(Layout.eltype(shared_b_layout)) + num_bytes_C(M, N, K) = prod(Layout.size(shared_c_layout, (M = M, N = N))) * sizeof(Layout.eltype(shared_c_layout)) + num_bytes_D(M, N, K) = prod(Layout.size(shared_d_layout, (M = M, N = N))) * sizeof(Layout.eltype(shared_d_layout)) + + next_MN(M, N, K) = M == N ? (2 * M, N, K) : (M, 2 * N, K) + next_K(M, N, K) = (M, N, 2 * K) + + cur = 1, 1, 1 # M, N, K + nxt = next_MN(cur...) + + while (max(num_bytes_C(nxt...), num_bytes_D(nxt...)) <= 64 * 1024) + cur = nxt + nxt = next_MN(cur...) + end + + nxt = next_K(cur...) + + while (num_bytes_A(nxt...) + num_bytes_B(nxt...) <= 64 * 1024) + cur = nxt + nxt = next_K(cur...) + end + + return (M = cur[1], N = cur[2], K = cur[3]) +end + +# Helper function that returns the logical size of a set of adjacent elements, taking care not +# to make the size larger than the parent tile +function adjacent_elements(num, parent_size) + p = Tuple(parent_size) + t = (min(num, p[1]), num ÷ min(num, p[1])) + + return typeof(parent_size)(t) +end + +function get_config(; gemm_shape, operator, global_a_layout, global_c_layout, kwargs...) + params = Dict(kwargs) + + # Use some simple heuristics to get sensible defaults for parameters the user does not specify. + + # Get the global layouts for B & D. + # Fallback to the layouts of A & C, respectively. + global_b_layout = get(params, :global_b_layout, global_a_layout) + global_d_layout = get(params, :global_d_layout, global_c_layout) + + # Get the shared layouts for A, B, C, D. + # For A & B, add padding to reduce bank conflicts, but preserve 128-bit (16 byte) alignment. + shared_a_layout = get(params, :shared_a_layout, + Layout.Padded{global_a_layout, 16 ÷ sizeof(Layout.eltype(global_a_layout))}) + shared_b_layout = get(params, :shared_b_layout, + Layout.Padded{global_b_layout, 16 ÷ sizeof(Layout.eltype(global_b_layout))}) + shared_c_layout = get(params, :shared_c_layout, global_c_layout) + shared_d_layout = get(params, :shared_d_layout, global_d_layout) + + # 8 warps in a 2 x 4 arrangement usually works well + warps_per_block = get(params, :warps_per_block, 8) + op_shape = Operator.shape(operator) + compute_warp = get(params, :compute_warp, + map(*, op_shape, (M = 2, N = 4, K = 1))) + + # Apply heuristic for block shape + block_shape = get(params, :block_shape, + heuristic_block_shape(shared_a_layout, shared_b_layout, shared_c_layout, shared_d_layout)) + + # Heuristics for memory tiling sizes: + # 1) The tiles should encompass 128 bits (16 bytes) to enable vectorisation. + # 2) The tiles should be as small as possible (i.e. each thread exactly 128 bits) to enable coalescing. 
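+    #
+    # For instance (assuming Float16 A/B, Float32 C/D, and a 128 x 128 x 64 block): each
+    # 32-thread warp then copies 32 * 8 = 256 Float16 elements of A at a time, i.e. a
+    # (M = 128, K = 2) warp tile with (M = 8, K = 1) per thread, while for the Float32 C/D
+    # matrices this works out to a (M = 128, N = 1) warp tile with (M = 4, N = 1) per thread.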
+ + mem_a_warp = get(params, :mem_a_warp, + adjacent_elements(32 * 16 ÷ sizeof(Layout.eltype(global_a_layout)), (M = block_shape.M, K = block_shape.K))) + mem_b_warp = get(params, :mem_b_warp, + adjacent_elements(32 * 16 ÷ sizeof(Layout.eltype(global_b_layout)), (K = block_shape.K, N = block_shape.N))) + mem_cd_warp = get(params, :mem_cd_warp, + adjacent_elements(32 * 16 ÷ sizeof(Layout.eltype(global_c_layout)), (M = block_shape.M, N = block_shape.N))) + + mem_a_thread = get(params, :mem_a_thread, + adjacent_elements(16 ÷ sizeof(Layout.eltype(global_a_layout)), (M = block_shape.M, K = block_shape.K))) + mem_b_thread = get(params, :mem_b_thread, + adjacent_elements(16 ÷ sizeof(Layout.eltype(global_b_layout)), (K = block_shape.K, N = block_shape.N))) + mem_cd_thread = get(params, :mem_cd_thread, + adjacent_elements(16 ÷ sizeof(Layout.eltype(global_c_layout)), (M = block_shape.M, N = block_shape.N))) + + return Config{ + #= Params =# + gemm_shape, + block_shape, + warps_per_block, + mem_a_warp, + mem_a_thread, + mem_b_warp, + mem_b_thread, + mem_cd_warp, + mem_cd_thread, + compute_warp, + op_shape, + + #= Layouts =# + global_a_layout, + global_b_layout, + global_c_layout, + global_d_layout, + + shared_a_layout, + shared_b_layout, + shared_c_layout, + shared_d_layout, + + #= Operators =# + operator, + } +end diff --git a/src/device/matmul_kernels/epilogue.jl b/src/device/matmul_kernels/epilogue.jl new file mode 100644 index 00000000..554f7016 --- /dev/null +++ b/src/device/matmul_kernels/epilogue.jl @@ -0,0 +1,35 @@ +module Epilogue + +using CUDAnative +using CUDAnative.Tiling +using CUDAnative.MatMul +using GPUifyLoops: @unroll + +# ---------------- +# Default epilogue +# ---------------- + +struct Default end + +@inline function (ep::Default)(d, shmem_d, transform, conf::Type{MatMul.Config{MATMUL_SHAPE, BLOCK_SHAPE, WARPS_PER_BLOCK, MEM_A_WARP, MEM_A_THREAD, MEM_B_WARP, MEM_B_THREAD, MEM_CD_WARP, MEM_CD_THREAD, COMPUTE_WARP, COMPUTE_OP_SHAPE, GLOBAL_A_LAYOUT, GLOBAL_B_LAYOUT, GLOBAL_C_LAYOUT, GLOBAL_D_LAYOUT, SHARED_A_LAYOUT, SHARED_B_LAYOUT, SHARED_C_LAYOUT, SHARED_D_LAYOUT, OPERATOR}}) where {MATMUL_SHAPE, BLOCK_SHAPE, WARPS_PER_BLOCK, MEM_A_WARP, MEM_A_THREAD, MEM_B_WARP, MEM_B_THREAD, MEM_CD_WARP, MEM_CD_THREAD, COMPUTE_WARP, COMPUTE_OP_SHAPE, GLOBAL_A_LAYOUT, GLOBAL_B_LAYOUT, GLOBAL_C_LAYOUT, GLOBAL_D_LAYOUT, SHARED_A_LAYOUT, SHARED_B_LAYOUT, SHARED_C_LAYOUT, SHARED_D_LAYOUT, OPERATOR} + # Constants + block_i = (blockIdx().x - 1) * BLOCK_SHAPE.M + block_j = (blockIdx().y - 1) * BLOCK_SHAPE.N + + warpId = (threadIdx().x - 1) ÷ 32 + 1 + laneId = (threadIdx().x - 1) % 32 + 1 + + gemm_sz = Tile(MATMUL_SHAPE) + block_tile = Tile(BLOCK_SHAPE) + + # Cooperatively store a BLOCK_SHAPE.M x BLOCK_SHAPE.N tile of D from shared to global memory within one threadblock + @unroll for warp_tile = parallellise(block_tile.MN, MEM_CD_WARP, warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, MEM_CD_THREAD, laneId, 32) + x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile, block_tile.MN.size) + x = transform(x, thread_tile) + Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) + end + end +end + +end diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl new file mode 100644 index 00000000..864780c6 --- /dev/null +++ b/src/device/matmul_kernels/kernel.jl @@ -0,0 +1,130 @@ +module Kernel + +using CUDAnative +using CUDAnative.Tiling +using CUDAnative.MatMul +using GPUifyLoops: @unroll +using 
StaticArrays + +function matmul_impl(a, b, c, d, + transf_gl2sh_a, transf_gl2sh_b, transf_gl2sh_c, transf_sh2gl_d, + transf_sh2rf_a, transf_sh2rf_b, transf_sh2rf_c, transf_rf2sh_d, + epilogue, + conf::Type{MatMul.Config{MATMUL_SHAPE, BLOCK_SHAPE, WARPS_PER_BLOCK, MEM_A_WARP, MEM_A_THREAD, MEM_B_WARP, MEM_B_THREAD, MEM_CD_WARP, MEM_CD_THREAD, COMPUTE_WARP, COMPUTE_OP_SHAPE, GLOBAL_A_LAYOUT, GLOBAL_B_LAYOUT, GLOBAL_C_LAYOUT, GLOBAL_D_LAYOUT, SHARED_A_LAYOUT, SHARED_B_LAYOUT, SHARED_C_LAYOUT, SHARED_D_LAYOUT, OPERATOR}}) where {MATMUL_SHAPE, BLOCK_SHAPE, WARPS_PER_BLOCK, MEM_A_WARP, MEM_A_THREAD, MEM_B_WARP, MEM_B_THREAD, MEM_CD_WARP, MEM_CD_THREAD, COMPUTE_WARP, COMPUTE_OP_SHAPE, GLOBAL_A_LAYOUT, GLOBAL_B_LAYOUT, GLOBAL_C_LAYOUT, GLOBAL_D_LAYOUT, SHARED_A_LAYOUT, SHARED_B_LAYOUT, SHARED_C_LAYOUT, SHARED_D_LAYOUT, OPERATOR} + # Calculate the number of fragments needed to fully cover a warp tile + NUM_FRAGMENTS_M = COMPUTE_WARP.M ÷ COMPUTE_OP_SHAPE.M + NUM_FRAGMENTS_N = COMPUTE_WARP.N ÷ COMPUTE_OP_SHAPE.N + + # Constants + block_i = (blockIdx().x - 1) * BLOCK_SHAPE.M + block_j = (blockIdx().y - 1) * BLOCK_SHAPE.N + + warpId = (threadIdx().x - 1) ÷ 32 + 1 + laneId = (threadIdx().x - 1) % 32 + 1 + + gemm_sz = Tile(MATMUL_SHAPE) + block_tile = Tile(BLOCK_SHAPE) + + # (1) Cooperatively load a BLOCK_SHAPE.M x BLOCK_SHAPE.N tile of C from global to shared memory within one threadblock + shmem_c = @cuDynamicSharedMem(Layout.eltype(SHARED_C_LAYOUT), Layout.size(SHARED_C_LAYOUT, block_tile.MN.size)) + + @unroll for warp_tile = parallellise(block_tile.MN, MEM_CD_WARP, warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, MEM_CD_THREAD, laneId, 32) + x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) + x = transf_gl2sh_c(x, thread_tile) + Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile, block_tile.MN.size) + end + end + + sync_threads() + + # (2) Load a COMPUTE_WARP.M x COMPUTE_WARP.N tile of C from shared memory into registers + warp_tile = subdivide(block_tile.MN, (M = COMPUTE_WARP.M, N = COMPUTE_WARP.N), warpId, WARPS_PER_BLOCK) + + c_frags = MArray{Tuple{NUM_FRAGMENTS_M, NUM_FRAGMENTS_N}, Operator.fragtype_accum(OPERATOR, SHARED_C_LAYOUT)}(undef) + + @unroll for i = 1 : NUM_FRAGMENTS_M + @unroll for j = 1 : NUM_FRAGMENTS_N + tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile, block_tile.MN.size), tile) + end + end + + sync_threads() + + # (3) Compute a BLOCK_SHAPE.M x BLOCK_SHAPE.N x BLOCK_SHAPE.K matrix product within one threadblock + shmem_a = @cuDynamicSharedMem(Layout.eltype(SHARED_A_LAYOUT), Layout.size(SHARED_A_LAYOUT, block_tile.MK.size)) + shmem_b = @cuDynamicSharedMem(Layout.eltype(SHARED_B_LAYOUT), Layout.size(SHARED_B_LAYOUT, block_tile.KN.size), + length(shmem_a) * sizeof(Layout.eltype(SHARED_A_LAYOUT))) + + @unroll for block_k = 0 : block_tile.size.K : gemm_sz.size.K - 1 + # (3.1) Cooperatively load a BLOCK_SHAPE.M x BLOCK_SHAPE.K tile of A from global to shared memory within one threadblock + @unroll for warp_tile = parallellise(block_tile.MK, MEM_A_WARP, warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, MEM_A_THREAD, laneId, 32) + x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k)), gemm_sz.MK.size) + x = transf_gl2sh_a(x, thread_tile) + Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile, 
block_tile.MK.size) + end + end + + # (3.2) Cooperatively load a BLOCK_SHAPE.K x BLOCK_SHAPE.N tile of B from global to shared memory within one threadblock + @unroll for warp_tile = parallellise(block_tile.KN, MEM_B_WARP, warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, MEM_B_THREAD, laneId, 32) + x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j)), gemm_sz.KN.size) + x = transf_gl2sh_b(x, thread_tile) + Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile, block_tile.KN.size) + end + end + + sync_threads() + + # (3.3) Calculate a COMPUTE_WARP.M x COMPUTE_WARP.N tile of D, using a COMPUTE_WARP.M x COMPUTE_WARP.N x COMPUTE_WARP.K operation + @unroll for warp_tile = parallellise(block_tile, COMPUTE_WARP, warpId, WARPS_PER_BLOCK) + # (3.3.1) Load a COMPUTE_WARP.M x COMPUTE_WARP.K tile of A from shared memory into registers + a_frags = MArray{Tuple{NUM_FRAGMENTS_M}, Operator.fragtype_a(OPERATOR, SHARED_A_LAYOUT)}(undef) + + @unroll for i = 1 : NUM_FRAGMENTS_M + a_tile = translate(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) + @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile, block_tile.MK.size), a_tile) + end + + # (3.3.2) Load a COMPUTE_WARP.K x COMPUTE_WARP.N tile of B from shared memory into registers + b_frags = MArray{Tuple{NUM_FRAGMENTS_N}, Operator.fragtype_b(OPERATOR, SHARED_B_LAYOUT)}(undef) + + @unroll for j = 1 : NUM_FRAGMENTS_N + b_tile = translate(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) + @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile, block_tile.KN.size), b_tile) + end + + # (3.3.3) Compute a COMPUTE_WARP.M x COMPUTE_WARP.N x COMPUTE_WARP.K matrix product within one warp + @unroll for i = 1 : NUM_FRAGMENTS_M + @unroll for j = 1 : NUM_FRAGMENTS_N + @inbounds c_frags[i, j] = Operator.mma(OPERATOR, a_frags[i], b_frags[j], c_frags[i, j]) + end + end + end + + sync_threads() + end + + # (4) Store the COMPUTE_WARP.M x COMPUTE_WARP.N tile of D from registers to shared memory + shmem_d = @cuDynamicSharedMem(Layout.eltype(SHARED_D_LAYOUT), Layout.size(SHARED_D_LAYOUT, block_tile.MN.size)) + + warp_tile = subdivide(block_tile.MN, (M = COMPUTE_WARP.M, N = COMPUTE_WARP.N), warpId, WARPS_PER_BLOCK) + + @unroll for i = 1 : NUM_FRAGMENTS_M + @unroll for j = 1 : NUM_FRAGMENTS_N + tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile, block_tile.MN.size) + end + end + + sync_threads() + + # (5) Run the epilogue + epilogue(d, shmem_d, transf_sh2gl_d, conf) + + return +end + +end diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl new file mode 100644 index 00000000..3768c1d9 --- /dev/null +++ b/src/device/matmul_kernels/layout.jl @@ -0,0 +1,50 @@ +export Layout +module Layout + +using CUDAnative +using CUDAnative.Tiling + +# ----------- +# Layout base +# ----------- + +abstract type LayoutBase{T} end + +@inline eltype(::Type{<:LayoutBase{T}}) where {T} = T +@inline size(::Type{<:LayoutBase{T}}, logical_size::NamedTuple) where {T} = Tuple(logical_size) + +# -------------- +# Padded layouts +# -------------- + +struct Padded{L, P} end + +@inline function pad_logical_coord(::Type{Padded{L, P}}, crd::NamedTuple) where {L, P} + t = Tuple(crd) + return typeof(crd)((Base.first(t) + P, Base.tail(t)...)) +end + +@inline eltype(::Type{Padded{L, P}}) where {L, 
P} = eltype(L) +@inline size(::Type{Padded{L, P}}, logical_size::NamedTuple) where {L, P} = size(L, pad_logical_coord(Padded{L, P}, logical_size)) +@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = load(L, workspace, tile, pad_logical_coord(Padded{L, P}, logical_size)) +@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile, logical_size::NamedTuple) where {L, P} = store!(L, workspace, value, tile::Tile, pad_logical_coord(Padded{L, P}, logical_size)) + +# --------------- +# AlignedColMajor +# --------------- + +struct AlignedColMajor{T} <: LayoutBase{T} end + +@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile, logical_size::NamedTuple) where {T} + N = 16 ÷ sizeof(T) + ptr = pointer(workspace, linearise(tile.base, logical_size)) + return vloada(Vec{N, T}, ptr, linearise(tile.offset, logical_size)) +end + +@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile, logical_size::NamedTuple) where {T} + N = 16 ÷ sizeof(T) + ptr = pointer(workspace, linearise(tile.base, logical_size)) + return vstorea!(Vec{N, T}, ptr, value, linearise(tile.offset, logical_size)) +end + +end diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl new file mode 100644 index 00000000..a7cdc132 --- /dev/null +++ b/src/device/matmul_kernels/operator.jl @@ -0,0 +1,66 @@ +export Operator +module Operator + +using CUDAnative +using CUDAnative.MatMul +using CUDAnative.Tiling + +# ------------------------------------- +# Default definition for padded layouts +# ------------------------------------- + +# Fragment types +for f in (:fragtype_a, :fragtype_b, :fragtype_accum) + @eval @inline $f(op, ::Type{Layout.Padded{L, P}}, args...) where {L, P} = $f(op, L, args...) 
+end + +# Load fragments +for f in (:load_a, :load_b, :load_c) + @eval @inline $f(op, ::Type{Layout.Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = $f(op, L, workspace, tile, Layout.pad_logical_coord(Layout.Padded{L, P}, logical_size)) +end + +# Store fragments +@inline store_d(op, ::Type{Layout.Padded{L, P}}, workspace, frag, tile::Tile, logical_size::NamedTuple) where {L, P} = store_d(op, L, workspace, frag, tile, Layout.pad_logical_coord(Layout.Padded{L, P}, logical_size)) + +# ---- +# WMMA +# ---- + +struct WMMAOp{M, N, K} end + +@inline shape(::Type{WMMAOp{M, N, K}}) where {M, N, K} = (M = M, N = N, K = K) + +@inline fragtype_a(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float16}}) = WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixA} +@inline fragtype_b(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float16}}) = WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixB} +@inline fragtype_accum(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float32}}) = WMMA.Fragment{16, 16, 16, 8, Float32, WMMA.Unspecified, WMMA.Accumulator} + +function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile, logical_size::NamedTuple) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ptr = pointer(workspace, linearise(tile.index, logical_size)) + return WMMA.load_a(ptr, logical_size.M, WMMA.ColMajor, conf) +end + +function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile, logical_size::NamedTuple) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ptr = pointer(workspace, linearise(tile.index, logical_size)) + return WMMA.load_b(ptr, logical_size.K, WMMA.ColMajor, conf) +end + +function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile, logical_size::NamedTuple) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ptr = pointer(workspace, linearise(tile.index, logical_size)) + return WMMA.load_c(ptr, logical_size.M, WMMA.ColMajor, conf) +end + +function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile, logical_size::NamedTuple) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ptr = pointer(workspace, linearise(tile.index, logical_size)) + WMMA.store_d(ptr, frag, logical_size.M, WMMA.ColMajor, conf) +end + +function mma(::Type{WMMAOp{M, N, K}}, a_frag, b_frag, c_frag) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + return WMMA.mma(a_frag, b_frag, c_frag, conf) +end + +end diff --git a/src/device/matmul_kernels/transform.jl b/src/device/matmul_kernels/transform.jl new file mode 100644 index 00000000..50b669b7 --- /dev/null +++ b/src/device/matmul_kernels/transform.jl @@ -0,0 +1,28 @@ +export Transform +module Transform + +# --------------------- +# Elementwise transform +# --------------------- + +export Elementwise + +""" + Elementwise{F} + +A simple transformation that applies a function elementwise. + +# Example +```julia +double_elements = Elementwise(x -> x * 2) +``` +""" +struct Elementwise{F} + func::F +end + +@inline Elementwise() = Elementwise(identity) + +@inline (transf::Elementwise)(x, tile) = transf.func.(x) + +end diff --git a/src/device/pointer.jl b/src/device/pointer.jl index 14b942a9..e6596d1e 100644 --- a/src/device/pointer.jl +++ b/src/device/pointer.jl @@ -261,3 +261,30 @@ unsafe_cached_load(p::DevicePtr{<:LDGTypes,AS.Global}, i::Integer=1, align::Val= # e.g. 
destruct/load/reconstruct, but that's too complicated for what it's worth. unsafe_cached_load(p::DevicePtr, i::Integer=1, align::Val=Val(1)) = pointerref(p, Int(i), align) + +# TODO: make this less hacky + +export Vec +struct Vec{N, T} end + +export vloada +@inline @generated function vloada(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, i::Integer = 1) where {N, T, AS} + alignment = sizeof(T) * N + vec_len = (sizeof(T) * N) ÷ sizeof(Float32) + + return quote + vec_ptr = convert(CUDAnative.DevicePtr{NTuple{$vec_len, VecElement{Float32}}, AS}, ptr) + return unsafe_load(vec_ptr, (i - 1) ÷ N + 1, Val($alignment)) + end +end + +export vstorea! +@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, x, i::Integer = 1) where {N, T, AS} + alignment = sizeof(T) * N + vec_len = (sizeof(T) * N) ÷ sizeof(Float32) + + return quote + vec_ptr = convert(CUDAnative.DevicePtr{NTuple{$vec_len, VecElement{Float32}}, AS}, ptr) + unsafe_store!(vec_ptr, x, (i - 1) ÷ N + 1, Val($alignment)) + end +end diff --git a/src/device/tiling.jl b/src/device/tiling.jl new file mode 100644 index 00000000..009cdbf7 --- /dev/null +++ b/src/device/tiling.jl @@ -0,0 +1,217 @@ +module Tiling + +# ----- +# Tiles +# ----- + +export Tile +""" + Tile{names, T} + +A [`Tile`](@ref) represents a part of a multidimensional tensor that is +contiguous and aligned to the tensor's dimensions. + +Note that the dimensions of this [`Tile`](@ref) are named. Similar to a +[`NamedTuple`](@ref), the names are stored as a type parameter `names`. + +A [`Tile`](@ref) contains several fields: +- `index`: A [`NamedTuple`](@ref) that represents the "first" multidimensional + index of the parent tensor that this tile contains. +- `base`: The part of the `index` that depends on runtime values, such as the + `threadIdx`. +- `offset`: The part of the `index` that is known at compile-time. +- `size`: A [`NamedTuple`](@ref) representing the size of the tile along each + dimension. + +You can also project a [`Tile`](@ref) (i.e. drop certain dimensions) by +accessing a special "field" of which the name is derived from the dimensions +you intend to keep. + +For example, to drop the `K` dimension of a tile containing `M`, `N` and `K` +dimensions, you can use the syntax `tile.MN`. +""" +struct Tile{names, T} + base::NamedTuple{names, T} + offset::NamedTuple{names, T} + size::NamedTuple{names, T} +end + +function Base.show(io::IO, tile::Tile{names, T}) where {names, T} + print(io, "base: ", tile.base, '\n') + print(io, "offset: ", tile.offset, '\n') + print(io, "size: ", tile.size) +end + +""" + Tile(; kw_args...) + +Creates a new [`Tile`](@ref) of the given `size`, with zero `base` and +`offset`. The `size` for each dimension must be specified by a keyword +argument. + +# Example +```julia +CUDAnative.Tiling.Tile(M = 24, N = 16, K = 4) +``` +""" +Tile(; kw_args...) = Tile((; kw_args...)) + +""" + Tile(size::NamedTuple{names, T}) + +Creates a new [`Tile`](@ref) of the given `size`, with zero `base` and +`offset`. + +# Arguments +- `size`: A `NamedTuple` representing the size of the [`Tile`](@ref). 
+ +# Example +```julia +CUDAnative.Tiling.Tile((M = 24, N = 16, K = 4)) +``` +""" +Tile(size::NamedTuple{names, T}) where {names, T} = Tile{names, T}(map(x -> 0, size), map(x -> 0, size), size) + +@generated function getproperty_impl(tile::Tile{names, T}, ::Val{sym}) where {names, T, sym} + if sym == :base || sym == :offset || sym == :size + # fields + return :(getfield(tile, sym)) + elseif sym == :index + # index: sum of base and offset + return :(map(+, getfield(tile, :base), getfield(tile, :offset))) + else + # tile projection + sym_str = String(sym) + names = ntuple(i -> Symbol(sym_str[i]), length(sym_str)) + return :( Tile(NamedTuple{$names}(getfield(tile, :base)), NamedTuple{$names}(getfield(tile, :offset)), NamedTuple{$names}(getfield(tile, :size))) ) + end +end + +@inline Base.getproperty(tile::Tile{names, T}, sym::Symbol) where {names, T} = getproperty_impl(tile, Val(sym)) + +export linearise + +""" + linearise(coord::NamedTuple{names, T}, dims::NamedTuple{names, T}) + +Convert a multidimensional coordinate to a linear index with respect to a +tensor with dimensions `dims`. + +# Arguments +- `coord`: A `NamedTuple` representing the coordinate. +- `dims`: A `NamedTuple` representing the size of the parent tensor. +""" +@inline function linearise(coord::NamedTuple{names, T}, dims::NamedTuple{names, T}) where {names, T} + ind = Tuple(coord) .+ 1 + @inbounds return LinearIndices(Tuple(dims))[ind...] +end + +export translate + +""" + translate(tile::Tile{names, T}, offset::NamedTuple{names, T}) + +Translate (i.e. move) a [`Tile`](@ref) by a constant `offset`. + +# Arguments +- `tile`: The [`Tile`](@ref) to translate. +- `offset`: The `offset` in each dimension. +""" +@inline function translate(tile::Tile{names, T}, offset::NamedTuple{names, T}) where {names, T} + base = map(+, tile.base, offset) + return Tile(base, tile.offset, tile.size) +end + +# ------------- +# TileIterators +# ------------- + +export TileIterator + +""" + TileIterator{names, T, N, R} + +A [`TileIterator`](@ref) represents an iterator over a set of [`Tile`](@ref)s. + +See also: [`subdivide`](@ref), [`parallellise`](@ref). +""" +struct TileIterator{names, T, N, R} + parent::Tile{names, T} + tile_size::T + subtile_indices::CartesianIndices{N, R} + idx::Int32 + step::Int32 +end + +export parallellise + +""" + parallellise(tile, tiling_size, idx, size) + +Split the given `tile` in subtiles of size `tiling_size` across a group of +cooperating entities (e.g. warps, threads, ...). + +Unlike [`subdivide`](@ref), the `tile` need not be completely covered by +`count` tiles of size `tiling_size`. If that's not the case, the subtiles +are evenly parallellised across all cooperating entities. + +Returns a [`TileIterator`](@ref) that iterates over the [`Tile`](@ref)s of +the calling entity. + +# Arguments +- `tile`: The [`Tile`](@ref) to parallellise. +- `tiling_size`: A `NamedTuple` indicating the size of a subtile along each dimension. +- `idx`: The identity of the calling entity. +- `count`: The number of cooperating entities. 
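+
+# Example
+The following sketch (with a hypothetical `warpId` in 1:8) splits a 64 x 64 tile into
+16 x 16 subtiles across 8 warps, so each warp's iterator yields 2 of the 16 subtiles:
+```julia
+for subtile in parallellise(Tile(M = 64, N = 64), (M = 16, N = 16), warpId, 8)
+    # use subtile.index, subtile.size, ...
+end
+```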
+""" +@inline function parallellise(tile::Tile{names, T}, tiling_size::NamedTuple{names, T}, idx, count) where {names, T} + # Number of tiles along each dimension + num_tiles = map(div, Tuple(tile.size), Tuple(tiling_size)) + + parent = tile + tile_size = Tuple(tiling_size) + subtile_indices = CartesianIndices(num_tiles) + step = count + + return TileIterator(parent, tile_size, subtile_indices, convert(Int32, idx), convert(Int32, step)) +end + +export subdivide + +""" + subdivide(tile, tiling_size, idx, count) + +Split the given `tile` in subtiles of size `tiling_size` across a group of +`count` cooperating entities (e.g. warps, threads, ...). + +The given `tile` must be completely covered by `count` tiles of size +`tiling_size`. + +Returns the [`Tile`](@ref) that the calling entity is responsible for. + +# Arguments +- `tile`: The [`Tile`](@ref) to subdivide. +- `tiling_size`: A `NamedTuple` indicating the size of a subtile along each dimension. +- `idx`: The identity of the calling entity. +- `count`: The number of cooperating entities. +""" +@inline function subdivide(tile::Tile{names, T}, tiling_size::NamedTuple{names, T}, idx, count) where {names, T} + return iterate(parallellise(tile, tiling_size, idx, count))[1] +end + +@inline function Base.iterate(it::TileIterator{names, T, N, R}, state = 1) where {names, T, N, R} + if state > length(it.subtile_indices) + return nothing + end + + # Calculate base and offset in number of tiles + @inbounds base = Tuple(it.parent.base) .+ (Tuple(it.subtile_indices[it.idx]) .- 1) .* Tuple(it.tile_size) + @inbounds offset = Tuple(it.parent.offset) .+ (Tuple(it.subtile_indices[state]) .- 1) .* Tuple(it.tile_size) + + # Create tile + tile = Tile{names, T}(NamedTuple{names, T}(base), NamedTuple{names, T}(offset), NamedTuple{names, T}(it.tile_size)) + + return (tile, state + it.step) +end + +end diff --git a/test/device/matmul_kernels.jl b/test/device/matmul_kernels.jl new file mode 100644 index 00000000..f3a2d085 --- /dev/null +++ b/test/device/matmul_kernels.jl @@ -0,0 +1,64 @@ +using CUDAnative +using CUDAnative.MatMul + +################################################################################ + +@testset "Matmul API" begin + @testset "WMMA GEMM" begin + @testset "(M = $M, N = $N, K = $K)" for M in [128, 256, 1024, 2048], + N in [128, 256, 1024, 2048], + K in [128, 256, 1024, 2048] + + a_h = rand(Float16, (M, K)) / sqrt(Float16(K)) + b_h = rand(Float16, (K, N)) / sqrt(Float16(K)) + c_h = rand(Float32, (M, N)) + + a = CuArray(a_h) + b = CuArray(b_h) + c = CuArray(c_h) + d = similar(c) + + conf = MatMul.get_config( + gemm_shape = (M = M, N = N, K = K), + operator = Operator.WMMAOp{16, 16, 16}, + global_a_layout = Layout.AlignedColMajor{Float16}, + global_c_layout = Layout.AlignedColMajor{Float32} + ) + + MatMul.matmul(a, b, c, d, conf) + + @test all(isapprox.(Float32.(a_h) * Float32.(b_h) + c_h, Array(d); rtol = sqrt(eps(Float16)))) + end + end + + @testset "WMMA GEMM + scaling" begin + @testset "(M = $M, N = $N, K = $K, alpha = $alpha)" for M in [128, 256], + N in [128, 256], + K in [128, 256], + alpha in [2, 5] + + a_h = rand(Float16, (M, K)) / sqrt(Float16(K)) + b_h = rand(Float16, (K, N)) / sqrt(Float16(K)) + c_h = rand(Float32, (M, N)) + + a = CuArray(a_h) + b = CuArray(b_h) + c = CuArray(c_h) + d = similar(c) + + conf = MatMul.get_config( + gemm_shape = (M = M, N = N, K = K), + operator = Operator.WMMAOp{16, 16, 16}, + global_a_layout = Layout.AlignedColMajor{Float16}, + global_c_layout = Layout.AlignedColMajor{Float32} + ) + + 
MatMul.matmul(a, b, c, d, conf; + transform_shared_to_regs_c = Transform.Elementwise(x -> x * alpha)) + + @test all(isapprox.(Float32.(a_h) * Float32.(b_h) + alpha * c_h, Array(d); rtol = sqrt(eps(Float16)))) + end + end +end + +################################################################################ diff --git a/test/device/tiling.jl b/test/device/tiling.jl new file mode 100644 index 00000000..6b269d60 --- /dev/null +++ b/test/device/tiling.jl @@ -0,0 +1,71 @@ +using CUDAnative.Tiling + +################################################################################ + +@testset "Tiling API" begin + @testset "Tiles" begin + @testset "Index" begin + @test Tile(M = 4, N = 4, K = 4).index == (M = 0, N = 0, K = 0) + end + + @testset "Projection" begin + @test Tile(M = 1, N = 2, K = 3).MN == Tile(M = 1, N = 2) + @test Tile(M = 1, N = 2, K = 3).NM == Tile(N = 2, M = 1) + @test Tile(M = 1, N = 2, K = 3).M == Tile(M = 1) + @test Tile(M = 1, N = 2, K = 3).KMN == Tile(K = 3, M = 1, N = 2) + end + + @testset "Translate" begin + tile = translate(Tile(M = 10, N = 20), (M = 1, N = 2)) + @test tile.size == (M = 10, N = 20) + @test tile.base == (M = 1, N = 2) + @test tile.offset == (M = 0, N = 0) + end + + @testset "Linearise" begin + tile = Tile(M = 3, N = 5) + for i = 0 : 2, j = 0 : 4 + tile_t = translate(tile, (M = i, N = j)) + @test linearise(tile_t.index, (M = 100, N = 200)) == j * 100 + i + 1 + @test linearise(tile_t.NM.index, (N = 200, M = 100)) == i * 200 + j + 1 + end + end + end + + @testset "Tile iteration" begin + @testset "Subdivide" begin + tile_size = (M = 8, N = 4) + num_tiles = (M = 2, N = 4) + tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N) + + for i = 1 : num_tiles.M * num_tiles.N + t = subdivide(tile, tile_size, i, num_tiles.M * num_tiles.N) + + @test t.offset == (M = 0, N = 0) + @test t.base == (M = tile_size.M * mod(i - 1, num_tiles.M), N = tile_size.N * fld(i - 1, num_tiles.M)) + @test t.size == tile_size + end + end + + @testset "Parallellise" begin + tile_size = (M = 8, N = 4) + num_tiles = (M = 2, N = 8) + tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N) + + for i = 1 : (num_tiles.M * num_tiles.N) ÷ 2 + t1, t2 = parallellise(tile, tile_size, i, (num_tiles.M * num_tiles.N) ÷ 2) + + @test t1.offset == (M = 0, N = 0) + @test t2.offset == (M = 0, N = 4 * tile_size.N) + + @test t1.base == (M = tile_size.M * mod(i - 1, num_tiles.M), N = tile_size.N * fld(i - 1, num_tiles.M)) + @test t2.base == (M = tile_size.M * mod(i - 1, num_tiles.M), N = tile_size.N * fld(i - 1, num_tiles.M)) + + @test t1.size == tile_size + @test t2.size == tile_size + end + end + end +end + +################################################################################ diff --git a/test/runtests.jl b/test/runtests.jl index 1cff84a2..472c771c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -34,6 +34,8 @@ include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") include("device/wmma.jl") +include("device/tiling.jl") +include("device/matmul_kernels.jl") include("nvtx.jl") From 6561ad79de7beb360d927db9a93ddea54e201364 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 25 Apr 2020 19:18:57 +0200 Subject: [PATCH 02/34] Add benchmark scripts and results --- test/perf/matmul_kernels/wmma/cublas.csv | 9 +++++ test/perf/matmul_kernels/wmma/cublas.jl | 33 ++++++++++++++++ test/perf/matmul_kernels/wmma/cublas.sh | 22 +++++++++++ test/perf/matmul_kernels/wmma/cudanative.csv | 9 +++++ 
test/perf/matmul_kernels/wmma/cudanative.jl | 37 ++++++++++++++++++ test/perf/matmul_kernels/wmma/cudanative.sh | 22 +++++++++++ .../wmma/cutlass-mma-turing.csv | 9 +++++ .../matmul_kernels/wmma/cutlass-mma-turing.sh | 22 +++++++++++ test/perf/matmul_kernels/wmma/cutlass-mma.csv | 9 +++++ test/perf/matmul_kernels/wmma/cutlass-mma.sh | 22 +++++++++++ .../perf/matmul_kernels/wmma/cutlass-wmma.csv | 9 +++++ test/perf/matmul_kernels/wmma/cutlass-wmma.sh | 22 +++++++++++ test/perf/matmul_kernels/wmma/plot.jl | 27 +++++++++++++ test/perf/matmul_kernels/wmma/plot.pdf | Bin 0 -> 20450 bytes 14 files changed, 252 insertions(+) create mode 100644 test/perf/matmul_kernels/wmma/cublas.csv create mode 100644 test/perf/matmul_kernels/wmma/cublas.jl create mode 100755 test/perf/matmul_kernels/wmma/cublas.sh create mode 100644 test/perf/matmul_kernels/wmma/cudanative.csv create mode 100644 test/perf/matmul_kernels/wmma/cudanative.jl create mode 100755 test/perf/matmul_kernels/wmma/cudanative.sh create mode 100644 test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv create mode 100755 test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh create mode 100644 test/perf/matmul_kernels/wmma/cutlass-mma.csv create mode 100755 test/perf/matmul_kernels/wmma/cutlass-mma.sh create mode 100644 test/perf/matmul_kernels/wmma/cutlass-wmma.csv create mode 100755 test/perf/matmul_kernels/wmma/cutlass-wmma.sh create mode 100644 test/perf/matmul_kernels/wmma/plot.jl create mode 100644 test/perf/matmul_kernels/wmma/plot.pdf diff --git a/test/perf/matmul_kernels/wmma/cublas.csv b/test/perf/matmul_kernels/wmma/cublas.csv new file mode 100644 index 00000000..3cc5007d --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cublas.csv @@ -0,0 +1,9 @@ +N,runtime +128,9241.600000 +256,13564.800000 +512,23936.000000 +1024,69990.400000 +2048,459043.200000 +4096,3187926.400000 +8192,24734774.400000 +16384,192036652.800000 diff --git a/test/perf/matmul_kernels/wmma/cublas.jl b/test/perf/matmul_kernels/wmma/cublas.jl new file mode 100644 index 00000000..1e6813f5 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cublas.jl @@ -0,0 +1,33 @@ +using CUDAapi +using CUDAdrv +using CUDAnative +using CUDAnative.MatMul +using CuArrays + +M = parse(Int, ARGS[1]) +N = parse(Int, ARGS[2]) +K = parse(Int, ARGS[3]) + +function benchmark_matmul(a, b, c, d) + CuArrays.@sync begin + CUBLAS.cublasSetMathMode(CUBLAS.handle(), CUBLAS.CUBLAS_TENSOR_OP_MATH) + CUBLAS.cublasGemmEx(CUBLAS.handle(), CUBLAS.CUBLAS_OP_N, CUBLAS.CUBLAS_OP_N, M, N, K, [Float32(1)], a, CUDAapi.R_16F, M, b, CUDAapi.R_16F, K, [Float32(1)], c, CUDAapi.R_32F, M, CUDAapi.R_32F, CUBLAS.CUBLAS_GEMM_DEFAULT) + end +end + +a_h = rand(Float16, (M, K)) / sqrt(Float16(K)) +b_h = rand(Float16, (K, N)) / sqrt(Float16(K)) +c_h = rand(Float32, (M, N)) + +a = CuArray(a_h) +b = CuArray(b_h) +c = CuArray(c_h) +d = similar(c) + +# warmup +benchmark_matmul(a, b, c, d) + +# profile +for i = 1 : 10 + CUDAdrv.@profile benchmark_matmul(a, b, c, d) +end diff --git a/test/perf/matmul_kernels/wmma/cublas.sh b/test/perf/matmul_kernels/wmma/cublas.sh new file mode 100755 index 00000000..4ced9287 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cublas.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cublas.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f 
--summary per-kernel --csv --units base ${JULIA_PATH}/julia cublas.jl $N $N $N 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cublas.csv +done diff --git a/test/perf/matmul_kernels/wmma/cudanative.csv b/test/perf/matmul_kernels/wmma/cudanative.csv new file mode 100644 index 00000000..458ac4e8 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cudanative.csv @@ -0,0 +1,9 @@ +N,runtime +128,17462.400000 +256,26332.800000 +512,43344.000000 +1024,87014.400000 +2048,540777.600000 +4096,3967702.400000 +8192,30435030.400000 +16384,236893779.200000 diff --git a/test/perf/matmul_kernels/wmma/cudanative.jl b/test/perf/matmul_kernels/wmma/cudanative.jl new file mode 100644 index 00000000..b17e2560 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cudanative.jl @@ -0,0 +1,37 @@ +using CUDAdrv +using CUDAnative +using CUDAnative.MatMul +using CuArrays + +M = parse(Int, ARGS[1]) +N = parse(Int, ARGS[2]) +K = parse(Int, ARGS[3]) + +function benchmark_matmul(a, b, c, d) + CuArrays.@sync begin + conf = MatMul.get_config( + gemm_shape = (M = M, N = N, K = K), + operator = Operator.WMMAOp{16, 16, 16}, + global_a_layout = Layout.AlignedColMajor{Float16}, + global_c_layout = Layout.AlignedColMajor{Float32}, + ) + MatMul.matmul(a, b, c, d, conf) + end +end + +a_h = rand(Float16, (M, K)) / sqrt(Float16(K)) +b_h = rand(Float16, (K, N)) / sqrt(Float16(K)) +c_h = rand(Float32, (M, N)) + +a = CuArray(a_h) +b = CuArray(b_h) +c = CuArray(c_h) +d = similar(c) + +# warmup +benchmark_matmul(a, b, c, d) + +# profile +for i = 1 : 10 + CUDAdrv.@profile benchmark_matmul(a, b, c, d) +end diff --git a/test/perf/matmul_kernels/wmma/cudanative.sh b/test/perf/matmul_kernels/wmma/cudanative.sh new file mode 100755 index 00000000..f9b5d7ab --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cudanative.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative.jl $N $N $N 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative.csv +done diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv new file mode 100644 index 00000000..c3d363b2 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv @@ -0,0 +1,9 @@ +N,runtime +128,20493.333333 +256,36458.666667 +512,62733.333333 +1024,119813.333333 +2048,465450.666667 +4096,3440157.333333 +8192,26701152.000000 +16384,215024610.666667 diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh new file mode 100755 index 00000000..acb214c0 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +CUTLASS_BUILD_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cutlass-mma-turing.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler 
--op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=16 --inst_n=8 --inst_k=8 --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cutlass-mma-turing.csv +done diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma.csv b/test/perf/matmul_kernels/wmma/cutlass-mma.csv new file mode 100644 index 00000000..b994957d --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cutlass-mma.csv @@ -0,0 +1,9 @@ +N,runtime +128,20309.333333 +256,33522.666667 +512,59837.333333 +1024,118997.333333 +2048,827818.666667 +4096,6395536.000000 +8192,49197301.333333 +16384,400406416.000000 diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma.sh b/test/perf/matmul_kernels/wmma/cutlass-mma.sh new file mode 100755 index 00000000..acc4b1c0 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cutlass-mma.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +CUTLASS_BUILD_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cutlass-mma.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=8 --inst_n=8 --inst_k=4 --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cutlass-mma.csv +done diff --git a/test/perf/matmul_kernels/wmma/cutlass-wmma.csv b/test/perf/matmul_kernels/wmma/cutlass-wmma.csv new file mode 100644 index 00000000..19bf72e4 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cutlass-wmma.csv @@ -0,0 +1,9 @@ +N,runtime +128,14274.666667 +256,22589.333333 +512,38648.000000 +1024,79410.666667 +2048,560162.666667 +4096,4084114.666667 +8192,31448712.000000 +16384,406712666.666667 diff --git a/test/perf/matmul_kernels/wmma/cutlass-wmma.sh b/test/perf/matmul_kernels/wmma/cutlass-wmma.sh new file mode 100755 index 00000000..8d378dce --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cutlass-wmma.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +CUTLASS_BUILD_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cutlass-wmma.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=wmmatensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cutlass-wmma.csv +done diff --git a/test/perf/matmul_kernels/wmma/plot.jl b/test/perf/matmul_kernels/wmma/plot.jl new file mode 100644 index 00000000..85a2c553 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/plot.jl @@ -0,0 +1,27 @@ +using CSV +using DataFrames +using Plots + +pyplot() + +function plot_results(file, label) + df = DataFrame(CSV.File(file)) + + N = df[!, :N] + mean_runtime = df[!, :runtime] .* 1e3 # in ps + + tflops = (2 .* N .^ 3) ./ mean_runtime + + plot!(N, tflops, label=label, xscale=:log2, markershape=:circle) +end + +plot_results("cudanative.csv", 
"CUDAnative") +plot_results("cublas.csv", "cuBLAS") +plot_results("cutlass-wmma.csv", "CUTLASS (WMMA)") +plot_results("cutlass-mma.csv", "CUTLASS (mma.m8n8k4)") +plot_results("cutlass-mma-turing.csv", "CUTLASS (mma.m16n8k8)") + +title!("Performance of mixed-precision GEMM\nProblem size: N x N x N") +xlabel!("N") +ylabel!("TFLOPS") +savefig("plot.pdf") diff --git a/test/perf/matmul_kernels/wmma/plot.pdf b/test/perf/matmul_kernels/wmma/plot.pdf new file mode 100644 index 0000000000000000000000000000000000000000..9461ca776d038335d5b218955e8a5ab9deaa0b1f GIT binary patch literal 20450 zcmeHvcRZHg8$Zb=n~dtQ$$0i-D?8({S0P*W-g~c*l@-b!Ei09jBH4sgB1Kkaq|o4Z z-;eryBCpTO>-YNo_xCJ44XmPco>cyNjicwI{>~Xh&WVsNdSx69QFs0@{@Q70dmK zl_4e&sE)0F@^Szx*3Q5$ zv7HK(++Dm}Az(MPxP7cX~9K!l)-{t+Q4)j#A1vh$l?MLS18L=dQ= zBOpV0YfBd^K;&xH&KEpwA#iYHw84TNp6=G>PGr8hJ?B(C*zVFi>bsp)6bOk_{E&U+ z=y0XV*(7WDlw59vtQHx=O?u_H#nsz=uWG4Ku$DZAi~i;D6VA27r$gSFo~ly)aD9A> zfWZA$Z=2sP9OVsYa7YudlUF(>9s{8XeYA^`*Zp zW%xDQ6hppyt0o0oBW=@53#MFBXaCJ}LIbg7$Ho%BH$@*MKk;(pp+Lu^ZCWhJb!~Y5 z=dH!>*H>m%dVIITBB~=;w-2?L-5|}ozkcHUPyZmlFM~y&S5{jz3kw!J?68#s(+Mqu z;~g7EjH=$-h0Sg^zl?OJKSkjW=0zmnNijy80v`9q%2UGRweUL(Acu2voPEzqd-u6M_`fi{zgxCEwF zYL$EaoMNL7H3GkoHWQbvhC;rL>RwIE(7vf`5ewxpG8Gm1c5YPqy)5D-@q9U!Pi0j4 zaX)J&!SGl4bEC&d^>a-VETmG7F0*wWcB&S5c7Jk!@f`kpVtg}tyhmp;L_P6_UAtdS z-y#r=J%v#c6zV>oJD?KKzRCzsYoD&El{~8WV&gski_vdW@GeJfHIh)GnuxXf*a~X_ zGyfs6^Y8tN@&pYEd5;KrsstO& z=9CC{E_OOR7fU^khA47YzsqPP@WgVboD1U7WBrmNwm`?lZYu6~Ir)Uh;2Q5mcPKC%uy?y>}vPG>bK&obTT>7rITw0=S#dc|A#33Qo#^bBmq zk3XxRZ&@O1x4iZ+wxZ&T-I-!Cn0d|H$dgTbm9 z8S1SbyKr>gRA`D3yjPm=-jb)DzofA;L0sRso0~Vr%@KPg?{MS?F1%azbfb@LZ^w3i zlPOfjZ(K>Dwwm|Fb78HW6e$V3;PJGl7{R*x$71Zg_^iOhoL=|I=&&7)s%U=8<>OVGt9?aNbh0Mg;PR7?S!X*F-XfP{ zy%oFi@+e!8SF13A%CQW5%Q}x~oS4)~)k>(Zof}GWE%At@>f1VOT}O9DXjZXqmCoem zU7;r~ZN$0Ll@=?#v|;O728BLsn@Ol5?3X5jlBWk41q->}i1q0}-J-v@&5$_SE@@AuC^hP7hz{y1ue_(aq!M z_o2_V+l@^Pja#lizxHg*j=5sM&Z31dYDYr&6AsUnrX$(1`e?IG4AvMl}L$`1|xbRsGk{r2Xl3R1i{{f6l)n}};NTZ)pcbm>|p92T6b z4T%>bL*>onBeUyQJT#%rGsmNl%K4P|UEi9WNj#qb<+TUSRu|Mb=B}kiC{d^_{ zm3;5E9(Su#UD+~JJLGI+WiNca^Zl`$(uvyB)s>WE9ir!JM4j~{IMtFxq$b|pb|{*) zo+PJ$*HyBrg%**}c^Df=r>Qu}b>n}2G~g`0Zt_YRGC-|4;?R~$QC2ZLMnSKIbx+PR zaF>^2(Gz4V*3=c_Mj`bcK+4^nONP^LE=^sVM1*AgJo>!7nk*V#q2r}AvmW%mytk}4 zvy+XR=xVZa|HbyQd_BPvE(=~%BX;BCj+t@{0y^hKCL-Q?Aqo3AdNWtO5GG~J~E4*S~?W?#kxUam^A`I<-S~QO-xYMcKK*vx1>~v zp-v^*TgX4;{*MHmn=CK1*s*G}_37BGiLV&F@7JmOh&>mrzYW9W4vwI%n{1ZLgsn2_&W#D%)@36+a!F1L81`4-1#v6p!+ zs(wM8tg19rxR$0NK(vPg?#|S;Fteoh%8?@Nbf$LR(?{`==aT)55!G@gWA&{p!VXOuVYDF z)w56Pr;B}2u~f~jlwy0>{5<^$Ne-jED#LZr^yS07_T-Un1@&gmUO`S_arL)_h8Cg` zm~7Z}Aq;E&5F<%fW4)o;4d9tzkGlf{CS9umO+8Pa3gk93*VnR8$kkcq zs()JXrlntkM1k$3vt78>1WO`EE*8dNoMl%2$uYcO;sRX)!iY}kX)USZBU7m>W|Nlr zn}<0Eg2M|^XO1<}+E3WNl@qO0ph_8W42g9M`eTYoUNJidu=6%BykFX6b-Uwf$^{!b zH959;Gxb~Deuy6@uD(+d2aSbau=SVOfP=Fj?nQ_o5x;E-#Gls3e}Q2DZvP2{(FqSI zAHzuG14{4FY>0-g$7!eh9HBN6z85e+HzW4AU7AX1_NkDS+<2+4vRHw%7V4afc>NiV zrba6y6T?sBXi(JRbf^_dd6K<<%6;lAzrS?m8`j{znMQ!N&_TdD0xhT|Jkc*9QMZ17~8u!Jhd-9TcewD0ki#Kq3x z-P~>52C-*5XH=36>(a9guT~uC>_eC9Z8U3itEt6s(Wtg7OT||OpC?|w7W}aIvys)0 zBFl&Yqf|A*JeH-)Md`1rbg!~Z9!I@R*T0(WZA~ngC}LH@@X}37Ulm zk|khY?HX(?33oy`$$Su*2?ROqa`gi8w#w2w#i}$Lb9_?L8L8K?Pcx4W9Gi-FIh$w0 z9L#&MapNq1ZX=Ucn682$`{Mhvz4p;hJUDgUyYonQb@L8dv>mF96YWjPwSkeq6X$Kk zj@jf8OR>Kq=5$^_b?H&qJv6F(wSI0(M`5$Q;mXuY{wwDyjMfsQ;<|e0D})xwi;^Zv zAp_%GS)ZsG1Pn_`dx#4ki^V7bB!y3bbAA{1PUrF0GJo16g zw)aG;O)++!X-&KQEDNsNMAO6d<>#sAwbCIuFkPj61P0pF2N4(?1KTAqsbp2!76^sh zdakU}8;iB**hl&$EB#`ueVNpA-eVHjgm+O-$~WEEqNc*>#dZ#-;cjLk!>x?wNFRE z6M}8S;zy!I^kAA* 
zfA3x8KH`!w)~tQlJ19sL*<(T{JfRGQQ4nnBad4vIun6i2!tWt=WQ%Rmc~*mbChr;S zWFvFmv@2^EE#*f>M9?Q!nL5o5JvylxPzkX|Y+(X@UXlB-G41o#9&=DYhX8#2cD;ks z?4ks)!-ETCjOFx)!mRT3A9`&&I6apzr`eV@Y)>cgtLW(E|EMZ0u0~Za+*Hj$e$$J4 zp)&`PFR)yYT~povdcw?=h39-Iqlnq8T_OwGb3D9M__AzREBB}DK(Bz=M;gq&_Iob) zNM21=$Dh~^ZdnqVEJ*#_na{e(n!ET(qSkqw*WR0>-<<*Rq88P5l!Q*ZcNp;F)opKO4GmOI}QMAgSc4OJmzQ)oDDPeeSHm{}hd^xra5* zzY=g>v^UrH(gA!H0cS>Ab9d0S67bsuF&N-zso)Iw5rUqRl9FUlc{>{$YtXL|@Jcj- z0Eie0_&Hhv{+cid6zA~hXl((m+;H+o^csxSTD4<$9UkDllfm-7#K?5Z$T^wDUA;8rms0;)O z_*nwpo$?T<0^sckfhs{nfV^seZzTk(4uNU_u9Of_U1gZxC)&|r7A_n|#4uM(# zbwQw3K;1xl5I78|(GG}!hYk>^BLwOMfjW~x0iIj|r%8Ynz@ZWl4A3ht2-F(_2jl|v z1tOsD{ty$gUyiN+NC}uxP(?d;4^L2%fCQnSjAeJ+J%!*f;EK8s3WJ2eMZ{1*CBO>- z!0!{xB5&2Bxh>EaH7gKG+qJV?fxd%6Rn09-|dVK5Mn0ssC<0Y%l!f3*ajYC_e` zoq$gLJDy}v9WM({oaZg*e+x)t#|?L9+zsrkJZ(JyR>0=KrSSW1{hw=E#={cyNfkqg z0U|edJ+m_)K)*nS!6!oCF$?H&dcgyj5IbBSpuc-r1$=b@*I+wK8RrX*)({vll;*xT z1|e`H3Xrvqr?rzlz~~=6_(Nn+!+#k2qaVMwP9q2g2K#>z$o_BZK>}ln0>%#oST>@- zg)$5hLna2CSc?E>X9(c_BNBL-0tPg|C-@x9j|9$FaWQbi0?7Ygf$aBlB#;*jU>&~$ z8UWF77$^Wy6cB)l#6%$?BEWB8{i1=!MZf^OD?x~%$$$W?F$@F^*z}^nJQV>Js|c`s zFfbqu#^7@#5C8`Z!1DrErx{<7OJz5(@kQ14RU~4R#HTfq)wiu(_Q;hW<5X z|5UY;02th<0s-_GD24{P0yqbYgVW&maQ%*fAtU?E2(AVc0H}a&z(Ido__*3FTha+W0T*T98SIczWgo&!MP4;gNg;`qywbnAi&^(8zZ1*5TGRi zq!b{)K;lx05CjMm0h$spSHTn+R1NUs-=SdvdF`asA-HSMKt2s%^mk~QpyI@p)dA+| zPD&R7FpR534_FgBGy`A@*h!f~fHjRPVF3Yzj7wPpYv5N3rx|gVX@GVB{fcXc4A5`D z#)-?jvuWcl9soIPAhXfQH6p`lY2&zuL5e04Kq~12RA@0t1fA;s^l_8F48m zV5aS)fG;xq>X!>J&v$5mUWWj8B5^rgL4A&^=U;#T=&GG(;0F(Oa00OB?2vXa0d-H%Io0OJRy$Z*&IcR~d8UezoThtc;y}ID~=&lYi<)u)^Q7MiE$-JL33P=YWm;el7#w`96dUK>H07 z{`tZM)Y2S+Gn$|>h<~yP!~ZiQ@^{An$o7XG2fO);mwy=qd;Av>?EG)8vB1{%ONdyY zgTEw*ML+;7gcBeZKrg?>4+~(J9TEQ90Iu-w%6uVKA_#fCs z(ZwAAWjoO2_lG-x2LJhxg;TQB5#WV(l#luo!t(@%+xkkk$1T(vw@)V4+vzaLj(_jKYGzYa$1%!V2E%C z=CMpCBI%(Zg`dY3qUl+rwb?vRf5&g_!T37StV>izg4H*!H?#Ps*%XfcZ5QF>_Wz~u z=W_VpoeO^~MCkvn6#m$G{&%GS?ySGJ1vwWlz8M|?c(T%6^>&n5B6iz8qN7z~9#ArLSWQd9tj;Dy0>`N^PiU_QX8&7uJ$V*~kz2C541590ib1{W0p5zcO!2z);psFnBR zL&ElxT`+62nO&n-%A5+ zzCAQi(AR7)4F#A!duV9%-|_d<;IIV(8Sw8P*~5?sP)F^dAw~YCi2x@g zd-4Gm#@>E_bMLRRC?s&ov9~N5M5?>nMTvrVZx2oEFZsdH;O&Gx`Ot{J+W@TBzxs<7 z1rIuQw}C-`hYJ$8-#m-K!Tn-)8)D!~?7cJ;2)*~@LyP@Q69KXRo_wMpg5E`g!{C372OO9j zdu0Ha*C2}AQ&tqj;k#*YU?u%UL;l4d9Jr45R~v8<&_>!*A7~Emp#e@Td&eA(5cx}{ z!03Wj&F=b;u)pRmaE|czeuWePj!E{EMZrM(Z+CqtxX52JKq3F$6HzF@WZqL&ROC;& zc)FY0Ia<5pzQw2PY~upKDG)82Q0~`0wNw~SYCvGc;ivk-J88^3r Ih9cSj0kx*rm;e9( literal 0 HcmV?d00001 From 4fe49e0034e2d694d39d082b9fd514c3865705b0 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 25 Apr 2020 19:19:40 +0200 Subject: [PATCH 03/34] Cleanup --- src/device/matmul_kernels/epilogue.jl | 8 ++-- src/device/matmul_kernels/kernel.jl | 38 +++++++++---------- src/device/matmul_kernels/layout.jl | 41 +++++++++++++++----- src/device/matmul_kernels/operator.jl | 43 +++++++++++---------- src/device/tiling.jl | 54 +++++++++++++++------------ test/device/tiling.jl | 4 +- test/runtests.jl | 22 +++++------ 7 files changed, 118 insertions(+), 92 deletions(-) diff --git a/src/device/matmul_kernels/epilogue.jl b/src/device/matmul_kernels/epilogue.jl index 554f7016..2f14d6b2 100644 --- a/src/device/matmul_kernels/epilogue.jl +++ b/src/device/matmul_kernels/epilogue.jl @@ -23,11 +23,11 @@ struct Default end block_tile = Tile(BLOCK_SHAPE) # Cooperatively store a BLOCK_SHAPE.M x BLOCK_SHAPE.N tile of D from shared to global memory within one threadblock - @unroll for warp_tile = parallellise(block_tile.MN, MEM_CD_WARP, warpId, WARPS_PER_BLOCK) - @unroll for thread_tile = 
parallellise(warp_tile, MEM_CD_THREAD, laneId, 32) - x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile, block_tile.MN.size) + @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32) + x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile) x = transform(x, thread_tile) - Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) + Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j))) end end end diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl index 864780c6..d25c502c 100644 --- a/src/device/matmul_kernels/kernel.jl +++ b/src/device/matmul_kernels/kernel.jl @@ -28,25 +28,25 @@ function matmul_impl(a, b, c, d, # (1) Cooperatively load a BLOCK_SHAPE.M x BLOCK_SHAPE.N tile of C from global to shared memory within one threadblock shmem_c = @cuDynamicSharedMem(Layout.eltype(SHARED_C_LAYOUT), Layout.size(SHARED_C_LAYOUT, block_tile.MN.size)) - @unroll for warp_tile = parallellise(block_tile.MN, MEM_CD_WARP, warpId, WARPS_PER_BLOCK) - @unroll for thread_tile = parallellise(warp_tile, MEM_CD_THREAD, laneId, 32) - x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) + @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32) + x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j))) x = transf_gl2sh_c(x, thread_tile) - Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile, block_tile.MN.size) + Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile) end end sync_threads() # (2) Load a COMPUTE_WARP.M x COMPUTE_WARP.N tile of C from shared memory into registers - warp_tile = subdivide(block_tile.MN, (M = COMPUTE_WARP.M, N = COMPUTE_WARP.N), warpId, WARPS_PER_BLOCK) + warp_tile = subdivide(block_tile.MN, Tile(COMPUTE_WARP).MN, warpId, WARPS_PER_BLOCK) c_frags = MArray{Tuple{NUM_FRAGMENTS_M, NUM_FRAGMENTS_N}, Operator.fragtype_accum(OPERATOR, SHARED_C_LAYOUT)}(undef) @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) - @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile, block_tile.MN.size), tile) + @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile), tile) end end @@ -59,33 +59,33 @@ function matmul_impl(a, b, c, d, @unroll for block_k = 0 : block_tile.size.K : gemm_sz.size.K - 1 # (3.1) Cooperatively load a BLOCK_SHAPE.M x BLOCK_SHAPE.K tile of A from global to shared memory within one threadblock - @unroll for warp_tile = parallellise(block_tile.MK, MEM_A_WARP, warpId, WARPS_PER_BLOCK) - @unroll for thread_tile = parallellise(warp_tile, MEM_A_THREAD, laneId, 32) - x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k)), gemm_sz.MK.size) + @unroll for warp_tile = parallellise(block_tile.MK, Tile(MEM_A_WARP), warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_A_THREAD), laneId, 32) + x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k))) x = transf_gl2sh_a(x, thread_tile) - Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile, block_tile.MK.size) + 
Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile) end end # (3.2) Cooperatively load a BLOCK_SHAPE.K x BLOCK_SHAPE.N tile of B from global to shared memory within one threadblock - @unroll for warp_tile = parallellise(block_tile.KN, MEM_B_WARP, warpId, WARPS_PER_BLOCK) - @unroll for thread_tile = parallellise(warp_tile, MEM_B_THREAD, laneId, 32) - x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j)), gemm_sz.KN.size) + @unroll for warp_tile = parallellise(block_tile.KN, Tile(MEM_B_WARP), warpId, WARPS_PER_BLOCK) + @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_B_THREAD), laneId, 32) + x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j))) x = transf_gl2sh_b(x, thread_tile) - Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile, block_tile.KN.size) + Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile) end end sync_threads() # (3.3) Calculate a COMPUTE_WARP.M x COMPUTE_WARP.N tile of D, using a COMPUTE_WARP.M x COMPUTE_WARP.N x COMPUTE_WARP.K operation - @unroll for warp_tile = parallellise(block_tile, COMPUTE_WARP, warpId, WARPS_PER_BLOCK) + @unroll for warp_tile = parallellise(block_tile, Tile(COMPUTE_WARP), warpId, WARPS_PER_BLOCK) # (3.3.1) Load a COMPUTE_WARP.M x COMPUTE_WARP.K tile of A from shared memory into registers a_frags = MArray{Tuple{NUM_FRAGMENTS_M}, Operator.fragtype_a(OPERATOR, SHARED_A_LAYOUT)}(undef) @unroll for i = 1 : NUM_FRAGMENTS_M a_tile = translate(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) - @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile, block_tile.MK.size), a_tile) + @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile), a_tile) end # (3.3.2) Load a COMPUTE_WARP.K x COMPUTE_WARP.N tile of B from shared memory into registers @@ -93,7 +93,7 @@ function matmul_impl(a, b, c, d, @unroll for j = 1 : NUM_FRAGMENTS_N b_tile = translate(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) - @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile, block_tile.KN.size), b_tile) + @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile), b_tile) end # (3.3.3) Compute a COMPUTE_WARP.M x COMPUTE_WARP.N x COMPUTE_WARP.K matrix product within one warp @@ -110,12 +110,12 @@ function matmul_impl(a, b, c, d, # (4) Store the COMPUTE_WARP.M x COMPUTE_WARP.N tile of D from registers to shared memory shmem_d = @cuDynamicSharedMem(Layout.eltype(SHARED_D_LAYOUT), Layout.size(SHARED_D_LAYOUT, block_tile.MN.size)) - warp_tile = subdivide(block_tile.MN, (M = COMPUTE_WARP.M, N = COMPUTE_WARP.N), warpId, WARPS_PER_BLOCK) + warp_tile = subdivide(block_tile.MN, Tile(COMPUTE_WARP).MN, warpId, WARPS_PER_BLOCK) @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) - Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile, block_tile.MN.size) + Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile) end end diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 3768c1d9..5a737e90 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -3,6 +3,8 @@ module Layout using CUDAnative using CUDAnative.Tiling +using GPUifyLoops +using StaticArrays # ----------- # Layout base @@ 
-26,8 +28,8 @@ end @inline eltype(::Type{Padded{L, P}}) where {L, P} = eltype(L) @inline size(::Type{Padded{L, P}}, logical_size::NamedTuple) where {L, P} = size(L, pad_logical_coord(Padded{L, P}, logical_size)) -@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = load(L, workspace, tile, pad_logical_coord(Padded{L, P}, logical_size)) -@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile, logical_size::NamedTuple) where {L, P} = store!(L, workspace, value, tile::Tile, pad_logical_coord(Padded{L, P}, logical_size)) +@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = load(L, workspace, tile) +@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile) where {L, P} = store!(L, workspace, value, tile::Tile) # --------------- # AlignedColMajor @@ -35,16 +37,35 @@ end struct AlignedColMajor{T} <: LayoutBase{T} end -@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile, logical_size::NamedTuple) where {T} - N = 16 ÷ sizeof(T) - ptr = pointer(workspace, linearise(tile.base, logical_size)) - return vloada(Vec{N, T}, ptr, linearise(tile.offset, logical_size)) +# TODO: cleanup vectorisation +@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size} + vec_len = 16 ÷ sizeof(T) + N = (sizeof(T) * vec_len) ÷ sizeof(Float32) + res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef) + + @unroll for j = 1 : size[2] + @unroll for i = 1 : vec_len : size[1] + t = translate(tile, (i - 1, j - 1)) + ind = Tuple(t.index) .+ 1 + @inbounds linear_index = LinearIndices(Base.size(workspace))[ind...] + @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace), linear_index) + end + end + + return res end -@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile, logical_size::NamedTuple) where {T} - N = 16 ÷ sizeof(T) - ptr = pointer(workspace, linearise(tile.base, logical_size)) - return vstorea!(Vec{N, T}, ptr, value, linearise(tile.offset, logical_size)) +@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} + vec_len = 16 ÷ sizeof(T) + + @unroll for j = 1 : size[2] + @unroll for i = 1 : vec_len : size[1] + t = translate(tile, (i - 1, j - 1)) + ind = Tuple(t.index) .+ 1 + @inbounds linear_index = LinearIndices(Base.size(workspace))[ind...] + vstorea!(Vec{vec_len, T}, pointer(workspace), value[i, j], linear_index) + end + end end end diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl index a7cdc132..809990b0 100644 --- a/src/device/matmul_kernels/operator.jl +++ b/src/device/matmul_kernels/operator.jl @@ -9,19 +9,10 @@ using CUDAnative.Tiling # Default definition for padded layouts # ------------------------------------- -# Fragment types -for f in (:fragtype_a, :fragtype_b, :fragtype_accum) +for f in (:fragtype_a, :fragtype_b, :fragtype_accum, :load_a, :load_b, :load_c, :store_d) @eval @inline $f(op, ::Type{Layout.Padded{L, P}}, args...) where {L, P} = $f(op, L, args...) 
end -# Load fragments -for f in (:load_a, :load_b, :load_c) - @eval @inline $f(op, ::Type{Layout.Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = $f(op, L, workspace, tile, Layout.pad_logical_coord(Layout.Padded{L, P}, logical_size)) -end - -# Store fragments -@inline store_d(op, ::Type{Layout.Padded{L, P}}, workspace, frag, tile::Tile, logical_size::NamedTuple) where {L, P} = store_d(op, L, workspace, frag, tile, Layout.pad_logical_coord(Layout.Padded{L, P}, logical_size)) - # ---- # WMMA # ---- @@ -34,28 +25,36 @@ struct WMMAOp{M, N, K} end @inline fragtype_b(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float16}}) = WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixB} @inline fragtype_accum(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float32}}) = WMMA.Fragment{16, 16, 16, 8, Float32, WMMA.Unspecified, WMMA.Accumulator} -function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile, logical_size::NamedTuple) where {M, N, K} +function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ptr = pointer(workspace, linearise(tile.index, logical_size)) - return WMMA.load_a(ptr, logical_size.M, WMMA.ColMajor, conf) + ind = Tuple(tile.index) .+ 1 + @inbounds linear_index = LinearIndices(size(workspace))[ind...] + ptr = pointer(workspace, linear_index) + return WMMA.load_a(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile, logical_size::NamedTuple) where {M, N, K} +function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ptr = pointer(workspace, linearise(tile.index, logical_size)) - return WMMA.load_b(ptr, logical_size.K, WMMA.ColMajor, conf) + ind = Tuple(tile.index) .+ 1 + @inbounds linear_index = LinearIndices(size(workspace))[ind...] + ptr = pointer(workspace, linear_index) + return WMMA.load_b(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile, logical_size::NamedTuple) where {M, N, K} +function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ptr = pointer(workspace, linearise(tile.index, logical_size)) - return WMMA.load_c(ptr, logical_size.M, WMMA.ColMajor, conf) + ind = Tuple(tile.index) .+ 1 + @inbounds linear_index = LinearIndices(size(workspace))[ind...] + ptr = pointer(workspace, linear_index) + return WMMA.load_c(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile, logical_size::NamedTuple) where {M, N, K} +function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ptr = pointer(workspace, linearise(tile.index, logical_size)) - WMMA.store_d(ptr, frag, logical_size.M, WMMA.ColMajor, conf) + ind = Tuple(tile.index) .+ 1 + @inbounds linear_index = LinearIndices(size(workspace))[ind...] 
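+ # take a pointer to the tile's first element; WMMA.store_d then uses size(workspace, 1) as the leading dimension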
+ ptr = pointer(workspace, linear_index) + WMMA.store_d(ptr, frag, size(workspace, 1), WMMA.ColMajor, conf) end function mma(::Type{WMMAOp{M, N, K}}, a_frag, b_frag, c_frag) where {M, N, K} diff --git a/src/device/tiling.jl b/src/device/tiling.jl index 009cdbf7..d3d47e0d 100644 --- a/src/device/tiling.jl +++ b/src/device/tiling.jl @@ -6,7 +6,7 @@ module Tiling export Tile """ - Tile{names, T} + Tile{size, names, T} A [`Tile`](@ref) represents a part of a multidimensional tensor that is contiguous and aligned to the tensor's dimensions. @@ -30,13 +30,12 @@ you intend to keep. For example, to drop the `K` dimension of a tile containing `M`, `N` and `K` dimensions, you can use the syntax `tile.MN`. """ -struct Tile{names, T} +struct Tile{size, names, T} base::NamedTuple{names, T} offset::NamedTuple{names, T} - size::NamedTuple{names, T} end -function Base.show(io::IO, tile::Tile{names, T}) where {names, T} +function Base.show(io::IO, tile::Tile{size, names, T}) where {size, names, T} print(io, "base: ", tile.base, '\n') print(io, "offset: ", tile.offset, '\n') print(io, "size: ", tile.size) @@ -70,12 +69,17 @@ Creates a new [`Tile`](@ref) of the given `size`, with zero `base` and CUDAnative.Tiling.Tile((M = 24, N = 16, K = 4)) ``` """ -Tile(size::NamedTuple{names, T}) where {names, T} = Tile{names, T}(map(x -> 0, size), map(x -> 0, size), size) +@inline Tile(size::NamedTuple{names, T}) where {names, T} = Tile{size, names, T}(map(x -> 0, size), map(x -> 0, size)) -@generated function getproperty_impl(tile::Tile{names, T}, ::Val{sym}) where {names, T, sym} - if sym == :base || sym == :offset || sym == :size +@inline projection_impl(base::NamedTuple{names, T}, offset::NamedTuple{names, T}, size::NamedTuple{names, T}) where {names, T} = Tile{size, names, T}(base, offset) + +@generated function getproperty_impl(tile::Tile{size, names, T}, ::Val{sym}) where {names, T, sym, size} + if sym == :base || sym == :offset # fields return :(getfield(tile, sym)) + elseif sym == :size + # size + return size elseif sym == :index # index: sum of base and offset return :(map(+, getfield(tile, :base), getfield(tile, :offset))) @@ -83,11 +87,13 @@ Tile(size::NamedTuple{names, T}) where {names, T} = Tile{names, T}(map(x -> 0, s # tile projection sym_str = String(sym) names = ntuple(i -> Symbol(sym_str[i]), length(sym_str)) - return :( Tile(NamedTuple{$names}(getfield(tile, :base)), NamedTuple{$names}(getfield(tile, :offset)), NamedTuple{$names}(getfield(tile, :size))) ) + return :( projection_impl(NamedTuple{$names}(getfield(tile, :base)), + NamedTuple{$names}(getfield(tile, :offset)), + NamedTuple{$names}(size)) ) end end -@inline Base.getproperty(tile::Tile{names, T}, sym::Symbol) where {names, T} = getproperty_impl(tile, Val(sym)) +@inline Base.getproperty(tile::Tile{size, names, T}, sym::Symbol) where {names, T, size} = getproperty_impl(tile, Val(sym)) export linearise @@ -117,11 +123,13 @@ Translate (i.e. move) a [`Tile`](@ref) by a constant `offset`. - `tile`: The [`Tile`](@ref) to translate. - `offset`: The `offset` in each dimension. 
""" -@inline function translate(tile::Tile{names, T}, offset::NamedTuple{names, T}) where {names, T} +@inline function translate(tile::Tile{size, names, T}, offset::NamedTuple{names, T}) where {names, T, size} base = map(+, tile.base, offset) - return Tile(base, tile.offset, tile.size) + return Tile{size, names, T}(base, tile.offset) end +@inline translate(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate(tile, NamedTuple{names}(offset)) + # ------------- # TileIterators # ------------- @@ -135,10 +143,9 @@ A [`TileIterator`](@ref) represents an iterator over a set of [`Tile`](@ref)s. See also: [`subdivide`](@ref), [`parallellise`](@ref). """ -struct TileIterator{names, T, N, R} - parent::Tile{names, T} - tile_size::T - subtile_indices::CartesianIndices{N, R} +struct TileIterator{tile_size, parent_size, names, T, S} + parent::Tile{parent_size, names, T} + subtile_indices::S idx::Int32 step::Int32 end @@ -164,16 +171,15 @@ the calling entity. - `idx`: The identity of the calling entity. - `count`: The number of cooperating entities. """ -@inline function parallellise(tile::Tile{names, T}, tiling_size::NamedTuple{names, T}, idx, count) where {names, T} +@inline function parallellise(tile::Tile{size, names, T}, tiling_size::Tile{tile_sz, names, T}, idx, count) where {names, T, size, tile_sz} # Number of tiles along each dimension - num_tiles = map(div, Tuple(tile.size), Tuple(tiling_size)) + num_tiles = map(div, Tuple(size), Tuple(tile_sz)) parent = tile - tile_size = Tuple(tiling_size) subtile_indices = CartesianIndices(num_tiles) step = count - return TileIterator(parent, tile_size, subtile_indices, convert(Int32, idx), convert(Int32, step)) + return TileIterator{tile_sz, size, names, T, typeof(subtile_indices)}(parent, subtile_indices, convert(Int32, idx), convert(Int32, step)) end export subdivide @@ -195,21 +201,21 @@ Returns the [`Tile`](@ref) that the calling entity is responsible for. - `idx`: The identity of the calling entity. - `count`: The number of cooperating entities. 
""" -@inline function subdivide(tile::Tile{names, T}, tiling_size::NamedTuple{names, T}, idx, count) where {names, T} +@inline function subdivide(tile::Tile{size, names, T}, tiling_size::Tile{tile_sz, names, T}, idx, count) where {names, T, size, tile_sz} return iterate(parallellise(tile, tiling_size, idx, count))[1] end -@inline function Base.iterate(it::TileIterator{names, T, N, R}, state = 1) where {names, T, N, R} +@inline function Base.iterate(it::TileIterator{tile_size, parent_size, names, T, S}, state = 1) where {tile_size, parent_size, names, T, S} if state > length(it.subtile_indices) return nothing end # Calculate base and offset in number of tiles - @inbounds base = Tuple(it.parent.base) .+ (Tuple(it.subtile_indices[it.idx]) .- 1) .* Tuple(it.tile_size) - @inbounds offset = Tuple(it.parent.offset) .+ (Tuple(it.subtile_indices[state]) .- 1) .* Tuple(it.tile_size) + @inbounds base = Tuple(it.parent.base) .+ (Tuple(it.subtile_indices[it.idx]) .- 1) .* Tuple(tile_size) + @inbounds offset = Tuple(it.parent.offset) .+ (Tuple(it.subtile_indices[state]) .- 1) .* Tuple(tile_size) # Create tile - tile = Tile{names, T}(NamedTuple{names, T}(base), NamedTuple{names, T}(offset), NamedTuple{names, T}(it.tile_size)) + tile = Tile{tile_size, names, T}(NamedTuple{names, T}(base), NamedTuple{names, T}(offset)) return (tile, state + it.step) end diff --git a/test/device/tiling.jl b/test/device/tiling.jl index 6b269d60..69dd81d5 100644 --- a/test/device/tiling.jl +++ b/test/device/tiling.jl @@ -39,7 +39,7 @@ using CUDAnative.Tiling tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N) for i = 1 : num_tiles.M * num_tiles.N - t = subdivide(tile, tile_size, i, num_tiles.M * num_tiles.N) + t = subdivide(tile, Tile(tile_size), i, num_tiles.M * num_tiles.N) @test t.offset == (M = 0, N = 0) @test t.base == (M = tile_size.M * mod(i - 1, num_tiles.M), N = tile_size.N * fld(i - 1, num_tiles.M)) @@ -53,7 +53,7 @@ using CUDAnative.Tiling tile = Tile(M = num_tiles.M * tile_size.M, N = num_tiles.N * tile_size.N) for i = 1 : (num_tiles.M * num_tiles.N) ÷ 2 - t1, t2 = parallellise(tile, tile_size, i, (num_tiles.M * num_tiles.N) ÷ 2) + t1, t2 = parallellise(tile, Tile(tile_size), i, (num_tiles.M * num_tiles.N) ÷ 2) @test t1.offset == (M = 0, N = 0) @test t2.offset == (M = 0, N = 4 * tile_size.N) diff --git a/test/runtests.jl b/test/runtests.jl index 472c771c..58a8b5e9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -23,22 +23,22 @@ if haskey(ENV, "CI") && haskey(ENV, "JULIA_CUDA_VERSION") end length(devices()) > 0 || error("The CUDAnative.jl test suite requires a CUDA device") -include("init.jl") -include("pointer.jl") -include("codegen.jl") +#= include("init.jl") =# +#= include("pointer.jl") =# +#= include("codegen.jl") =# capability(device()) >= v"2.0" || error("The CUDAnative.jl test suite requires a CUDA device with compute capability 2.0 or higher") -include("device/codegen.jl") -include("device/execution.jl") -include("device/pointer.jl") -include("device/array.jl") -include("device/cuda.jl") -include("device/wmma.jl") +#= include("device/codegen.jl") =# +#= include("device/execution.jl") =# +#= include("device/pointer.jl") =# +#= include("device/array.jl") =# +#= include("device/cuda.jl") =# +#= include("device/wmma.jl") =# include("device/tiling.jl") include("device/matmul_kernels.jl") -include("nvtx.jl") +#= include("nvtx.jl") =# -include("examples.jl") +#= include("examples.jl") =# end From ca01bafff049981f374a231e606b6a4af3e628d9 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert 
Date: Sat, 25 Apr 2020 21:15:38 +0200 Subject: [PATCH 04/34] Fix performance regression --- src/device/matmul_kernels/layout.jl | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 5a737e90..5a0c4e44 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -46,9 +46,14 @@ struct AlignedColMajor{T} <: LayoutBase{T} end @unroll for j = 1 : size[2] @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - ind = Tuple(t.index) .+ 1 - @inbounds linear_index = LinearIndices(Base.size(workspace))[ind...] - @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace), linear_index) + + base = Tuple(t.base) .+ 1 + @inbounds linear_base = LinearIndices(Base.size(workspace))[base...] + + offset = Tuple(t.offset) .+ 1 + @inbounds linear_offset = LinearIndices(Base.size(workspace))[offset...] + + @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset) end end @@ -61,9 +66,14 @@ end @unroll for j = 1 : size[2] @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - ind = Tuple(t.index) .+ 1 - @inbounds linear_index = LinearIndices(Base.size(workspace))[ind...] - vstorea!(Vec{vec_len, T}, pointer(workspace), value[i, j], linear_index) + + base = Tuple(t.base) .+ 1 + @inbounds linear_base = LinearIndices(Base.size(workspace))[base...] + + offset = Tuple(t.offset) .+ 1 + @inbounds linear_offset = LinearIndices(Base.size(workspace))[offset...] + + vstorea!(Vec{vec_len, T}, pointer(workspace, linear_base), value[i, j], linear_offset) end end end From 8911e98ea6af28457ca63c24287d2e45cab5d320 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 25 Apr 2020 22:38:09 +0200 Subject: [PATCH 05/34] Disable verification in CUTLASS profiles --- test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh | 2 +- test/perf/matmul_kernels/wmma/cutlass-mma.sh | 2 +- test/perf/matmul_kernels/wmma/cutlass-wmma.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh index acb214c0..c3a458ae 100755 --- a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh +++ b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh @@ -16,7 +16,7 @@ for i in {7..14}; do N=$((2**i)) # runtime in ns - runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=16 --inst_n=8 --inst_k=8 --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=16 --inst_n=8 --inst_k=8 --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') printf "$N,$runtime\n" >>cutlass-mma-turing.csv done diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma.sh b/test/perf/matmul_kernels/wmma/cutlass-mma.sh index acc4b1c0..c98eef94 100755 --- a/test/perf/matmul_kernels/wmma/cutlass-mma.sh +++ b/test/perf/matmul_kernels/wmma/cutlass-mma.sh @@ -16,7 
+16,7 @@ for i in {7..14}; do N=$((2**i)) # runtime in ns - runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=8 --inst_n=8 --inst_k=4 --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=8 --inst_n=8 --inst_k=4 --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') printf "$N,$runtime\n" >>cutlass-mma.csv done diff --git a/test/perf/matmul_kernels/wmma/cutlass-wmma.sh b/test/perf/matmul_kernels/wmma/cutlass-wmma.sh index 8d378dce..96d5dbf3 100755 --- a/test/perf/matmul_kernels/wmma/cutlass-wmma.sh +++ b/test/perf/matmul_kernels/wmma/cutlass-wmma.sh @@ -16,7 +16,7 @@ for i in {7..14}; do N=$((2**i)) # runtime in ns - runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=wmmatensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --warmup-iterations=1 --profiling-iterations=10 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=wmmatensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') printf "$N,$runtime\n" >>cutlass-wmma.csv done From 899edb5d5dcbebf2d43551ece8cd00148368cf5f Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 25 Apr 2020 22:44:56 +0200 Subject: [PATCH 06/34] Set kernel names directly --- test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh | 2 +- test/perf/matmul_kernels/wmma/cutlass-mma.sh | 2 +- test/perf/matmul_kernels/wmma/cutlass-wmma.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh index c3a458ae..bfeae1d8 100755 --- a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh +++ b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.sh @@ -16,7 +16,7 @@ for i in {7..14}; do N=$((2**i)) # runtime in ns - runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=16 --inst_n=8 --inst_k=8 --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --m=$N --n=$N --k=$N --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false --kernels=cutlass_tensorop_s1688gemm_f16_128x128_nn 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print 
$NF}' | sed 's/"//g') printf "$N,$runtime\n" >>cutlass-mma-turing.csv done diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma.sh b/test/perf/matmul_kernels/wmma/cutlass-mma.sh index c98eef94..e6483419 100755 --- a/test/perf/matmul_kernels/wmma/cutlass-mma.sh +++ b/test/perf/matmul_kernels/wmma/cutlass-mma.sh @@ -16,7 +16,7 @@ for i in {7..14}; do N=$((2**i)) # runtime in ns - runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=tensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --inst_m=8 --inst_n=8 --inst_k=4 --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --m=$N --n=$N --k=$N --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false --kernels=cutlass_tensorop_s884gemm_f16_128x128_nn 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') printf "$N,$runtime\n" >>cutlass-mma.csv done diff --git a/test/perf/matmul_kernels/wmma/cutlass-wmma.sh b/test/perf/matmul_kernels/wmma/cutlass-wmma.sh index 96d5dbf3..d6f53268 100755 --- a/test/perf/matmul_kernels/wmma/cutlass-wmma.sh +++ b/test/perf/matmul_kernels/wmma/cutlass-wmma.sh @@ -16,7 +16,7 @@ for i in {7..14}; do N=$((2**i)) # runtime in ns - runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --op_class=wmmatensorop --A=f16:col --B=f16:col --C=f32 --accum=f32 --m=$N --n=$N --k=$N --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/tools/profiler/cutlass_profiler --m=$N --n=$N --k=$N --warmup-iterations=1 --profiling-iterations=10 --verification-enabled=false --kernels=cutlass_wmma_tensorop_s161616gemm_f16_128x128_nn 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') printf "$N,$runtime\n" >>cutlass-wmma.csv done From c2d52ee88a5cb4202b6fa863a0d454d0f6b2c8ea Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 29 Apr 2020 22:37:30 +0200 Subject: [PATCH 07/34] Add generic_matmul FP32 benchmark --- .../wmma/cudanative-generic-fp32.sh | 22 ++++++++++++++++ .../matmul_kernels/wmma/cudanative-generic.jl | 25 +++++++++++++++++++ 2 files changed, 47 insertions(+) create mode 100755 test/perf/matmul_kernels/wmma/cudanative-generic-fp32.sh create mode 100644 test/perf/matmul_kernels/wmma/cudanative-generic.jl diff --git a/test/perf/matmul_kernels/wmma/cudanative-generic-fp32.sh b/test/perf/matmul_kernels/wmma/cudanative-generic-fp32.sh new file mode 100755 index 00000000..06343260 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cudanative-generic-fp32.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative-generic-fp32.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base 
${JULIA_PATH}/julia cudanative-generic.jl $N $N $N FP32 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative-generic-fp32.csv +done diff --git a/test/perf/matmul_kernels/wmma/cudanative-generic.jl b/test/perf/matmul_kernels/wmma/cudanative-generic.jl new file mode 100644 index 00000000..2adea000 --- /dev/null +++ b/test/perf/matmul_kernels/wmma/cudanative-generic.jl @@ -0,0 +1,25 @@ +using CUDAnative, CuArrays, GPUArrays, CUDAdrv; + +M = parse(Int, ARGS[1]); +N = parse(Int, ARGS[2]); +K = parse(Int, ARGS[3]); + +if ARGS[4] == "FP32" + T = Float32; +elseif ARGS[4] == "FP16" + T = Float16; +else + error("Invalid type: $(ARGS[4])"); +end + +a = CuArray(rand(T, (M, K))); +b = CuArray(rand(T, (K, N))); +c = CuArray(rand(T, (M, N))); + +# warmup +GPUArrays.generic_matmatmul!(c, b, a, T(1), T(1)) + +# profile +for i = 1 : 10 + CUDAdrv.@profile GPUArrays.generic_matmatmul!(c, b, a, T(1), T(1)) +end From 6b8562d3462cb7e8de0e737ea30f69141cbdaa38 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 29 Apr 2020 22:51:07 +0200 Subject: [PATCH 08/34] Remove output files --- test/perf/matmul_kernels/wmma/.gitignore | 2 ++ test/perf/matmul_kernels/wmma/cublas.csv | 9 --------- test/perf/matmul_kernels/wmma/cudanative.csv | 9 --------- .../matmul_kernels/wmma/cutlass-mma-turing.csv | 9 --------- test/perf/matmul_kernels/wmma/cutlass-mma.csv | 9 --------- test/perf/matmul_kernels/wmma/cutlass-wmma.csv | 9 --------- test/perf/matmul_kernels/wmma/plot.pdf | Bin 20450 -> 0 bytes 7 files changed, 2 insertions(+), 45 deletions(-) create mode 100644 test/perf/matmul_kernels/wmma/.gitignore delete mode 100644 test/perf/matmul_kernels/wmma/cublas.csv delete mode 100644 test/perf/matmul_kernels/wmma/cudanative.csv delete mode 100644 test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv delete mode 100644 test/perf/matmul_kernels/wmma/cutlass-mma.csv delete mode 100644 test/perf/matmul_kernels/wmma/cutlass-wmma.csv delete mode 100644 test/perf/matmul_kernels/wmma/plot.pdf diff --git a/test/perf/matmul_kernels/wmma/.gitignore b/test/perf/matmul_kernels/wmma/.gitignore new file mode 100644 index 00000000..5919a65f --- /dev/null +++ b/test/perf/matmul_kernels/wmma/.gitignore @@ -0,0 +1,2 @@ +*.csv +*.pdf diff --git a/test/perf/matmul_kernels/wmma/cublas.csv b/test/perf/matmul_kernels/wmma/cublas.csv deleted file mode 100644 index 3cc5007d..00000000 --- a/test/perf/matmul_kernels/wmma/cublas.csv +++ /dev/null @@ -1,9 +0,0 @@ -N,runtime -128,9241.600000 -256,13564.800000 -512,23936.000000 -1024,69990.400000 -2048,459043.200000 -4096,3187926.400000 -8192,24734774.400000 -16384,192036652.800000 diff --git a/test/perf/matmul_kernels/wmma/cudanative.csv b/test/perf/matmul_kernels/wmma/cudanative.csv deleted file mode 100644 index 458ac4e8..00000000 --- a/test/perf/matmul_kernels/wmma/cudanative.csv +++ /dev/null @@ -1,9 +0,0 @@ -N,runtime -128,17462.400000 -256,26332.800000 -512,43344.000000 -1024,87014.400000 -2048,540777.600000 -4096,3967702.400000 -8192,30435030.400000 -16384,236893779.200000 diff --git a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv b/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv deleted file mode 100644 index c3d363b2..00000000 --- a/test/perf/matmul_kernels/wmma/cutlass-mma-turing.csv +++ /dev/null @@ -1,9 +0,0 @@ -N,runtime -128,20493.333333 -256,36458.666667 -512,62733.333333 -1024,119813.333333 -2048,465450.666667 -4096,3440157.333333 -8192,26701152.000000 -16384,215024610.666667 diff --git 
a/test/perf/matmul_kernels/wmma/cutlass-mma.csv b/test/perf/matmul_kernels/wmma/cutlass-mma.csv
deleted file mode 100644
index b994957d..00000000
--- a/test/perf/matmul_kernels/wmma/cutlass-mma.csv
+++ /dev/null
@@ -1,9 +0,0 @@
-N,runtime
-128,20309.333333
-256,33522.666667
-512,59837.333333
-1024,118997.333333
-2048,827818.666667
-4096,6395536.000000
-8192,49197301.333333
-16384,400406416.000000
diff --git a/test/perf/matmul_kernels/wmma/cutlass-wmma.csv b/test/perf/matmul_kernels/wmma/cutlass-wmma.csv
deleted file mode 100644
index 19bf72e4..00000000
--- a/test/perf/matmul_kernels/wmma/cutlass-wmma.csv
+++ /dev/null
@@ -1,9 +0,0 @@
-N,runtime
-128,14274.666667
-256,22589.333333
-512,38648.000000
-1024,79410.666667
-2048,560162.666667
-4096,4084114.666667
-8192,31448712.000000
-16384,406712666.666667
diff --git a/test/perf/matmul_kernels/wmma/plot.pdf b/test/perf/matmul_kernels/wmma/plot.pdf
deleted file mode 100644
index 9461ca776d038335d5b218955e8a5ab9deaa0b1f..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 20450
[base85-encoded binary data for plot.pdf omitted]

From df4ddc2873542e3fd5b874db7d6f2b4f39fc5bfb Mon Sep 17 00:00:00 2001
From: Thomas Faingnaert
Date: Wed, 29 Apr 2020 22:52:11 +0200
Subject: [PATCH 09/34] Add generic_matmul FP16 benchmark

---
 .../wmma/cudanative-generic-fp16.sh | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh

diff --git a/test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh b/test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh
new file mode 100644
index 00000000..46259b38
--- /dev/null
+++ b/test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -Eeuo pipefail
+
+if [[ $# < 1 ]]; then
+ echo "Usage $0 " 1>&2
+ exit 1
+fi
+
+JULIA_PATH=$1
+
+cd "$( dirname "${BASH_SOURCE[0]}" )"
+
+printf "N,runtime\n" >cudanative-generic-fp16.csv
+
+for i in {7..14}; do
+ N=$((2**i))
+
+ # runtime in ns
+ runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative-generic.jl $N $N $N FP16 2>/dev/null | grep 
'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative-generic-fp16.csv +done From 62d9b47d697953e0c32b46f814977d6678a18195 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 29 Apr 2020 23:05:23 +0200 Subject: [PATCH 10/34] chmod +x --- test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh diff --git a/test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh b/test/perf/matmul_kernels/wmma/cudanative-generic-fp16.sh old mode 100644 new mode 100755 From d5ae922c60c0348225d028dbc8842c8faef96b05 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sun, 3 May 2020 23:00:44 +0200 Subject: [PATCH 11/34] Add generic plots to legend --- test/perf/matmul_kernels/wmma/plot.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/perf/matmul_kernels/wmma/plot.jl b/test/perf/matmul_kernels/wmma/plot.jl index 85a2c553..300f2006 100644 --- a/test/perf/matmul_kernels/wmma/plot.jl +++ b/test/perf/matmul_kernels/wmma/plot.jl @@ -16,6 +16,8 @@ function plot_results(file, label) end plot_results("cudanative.csv", "CUDAnative") +plot_results("cudanative-generic-fp32.csv", "CUDAnative generic (FP32)") +plot_results("cudanative-generic-fp16.csv", "CUDAnative generic (FP16)") plot_results("cublas.csv", "cuBLAS") plot_results("cutlass-wmma.csv", "CUTLASS (WMMA)") plot_results("cutlass-mma.csv", "CUTLASS (mma.m8n8k4)") From 016c86bd035e5808da89a86744e46ccff768a68a Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Mon, 25 May 2020 09:05:02 -0400 Subject: [PATCH 12/34] Use linearise in layout --- src/device/matmul_kernels/layout.jl | 14 ++++---------- src/device/tiling.jl | 2 ++ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 5a0c4e44..12062df3 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -47,11 +47,8 @@ struct AlignedColMajor{T} <: LayoutBase{T} end @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - base = Tuple(t.base) .+ 1 - @inbounds linear_base = LinearIndices(Base.size(workspace))[base...] - - offset = Tuple(t.offset) .+ 1 - @inbounds linear_offset = LinearIndices(Base.size(workspace))[offset...] + linear_base = linearise(t.base, Base.size(workspace)) + linear_offset = linearise(t.offset, Base.size(workspace)) @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset) end @@ -67,11 +64,8 @@ end @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - base = Tuple(t.base) .+ 1 - @inbounds linear_base = LinearIndices(Base.size(workspace))[base...] - - offset = Tuple(t.offset) .+ 1 - @inbounds linear_offset = LinearIndices(Base.size(workspace))[offset...] + linear_base = linearise(t.base, Base.size(workspace)) + linear_offset = linearise(t.offset, Base.size(workspace)) vstorea!(Vec{vec_len, T}, pointer(workspace, linear_base), value[i, j], linear_offset) end diff --git a/src/device/tiling.jl b/src/device/tiling.jl index d3d47e0d..1bc52e00 100644 --- a/src/device/tiling.jl +++ b/src/device/tiling.jl @@ -112,6 +112,8 @@ tensor with dimensions `dims`. @inbounds return LinearIndices(Tuple(dims))[ind...] 
end +@inline linearise(coord::NamedTuple{names, T}, dims::Tuple) where {names, T} = linearise(coord, NamedTuple{names}(dims)) + export translate """ From be21dfaa3aebe148c5a6ca8dbc5a5418a5cbf169 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Mon, 25 May 2020 09:25:28 -0400 Subject: [PATCH 13/34] Reintroduce workspace_size This ensures that the size of the array in global memory is known statically. --- src/device/matmul_kernels/epilogue.jl | 4 ++-- src/device/matmul_kernels/kernel.jl | 12 ++++++------ src/device/matmul_kernels/layout.jl | 16 ++++++++-------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/device/matmul_kernels/epilogue.jl b/src/device/matmul_kernels/epilogue.jl index 2f14d6b2..f0f7eaaf 100644 --- a/src/device/matmul_kernels/epilogue.jl +++ b/src/device/matmul_kernels/epilogue.jl @@ -25,9 +25,9 @@ struct Default end # Cooperatively store a BLOCK_SHAPE.M x BLOCK_SHAPE.N tile of D from shared to global memory within one threadblock @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32) - x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile) + x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile, block_tile.MN.size) x = transform(x, thread_tile) - Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j))) + Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) end end end diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl index d25c502c..1642de40 100644 --- a/src/device/matmul_kernels/kernel.jl +++ b/src/device/matmul_kernels/kernel.jl @@ -30,9 +30,9 @@ function matmul_impl(a, b, c, d, @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32) - x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j))) + x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) x = transf_gl2sh_c(x, thread_tile) - Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile) + Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile, block_tile.MN.size) end end @@ -61,18 +61,18 @@ function matmul_impl(a, b, c, d, # (3.1) Cooperatively load a BLOCK_SHAPE.M x BLOCK_SHAPE.K tile of A from global to shared memory within one threadblock @unroll for warp_tile = parallellise(block_tile.MK, Tile(MEM_A_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_A_THREAD), laneId, 32) - x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k))) + x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k)), gemm_sz.MK.size) x = transf_gl2sh_a(x, thread_tile) - Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile) + Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile, block_tile.MK.size) end end # (3.2) Cooperatively load a BLOCK_SHAPE.K x BLOCK_SHAPE.N tile of B from global to shared memory within one threadblock @unroll for warp_tile = parallellise(block_tile.KN, Tile(MEM_B_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_B_THREAD), laneId, 32) - x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j))) + x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = 
block_j)), gemm_sz.KN.size) x = transf_gl2sh_b(x, thread_tile) - Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile) + Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile, block_tile.KN.size) end end diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 12062df3..917f564e 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -28,8 +28,8 @@ end @inline eltype(::Type{Padded{L, P}}) where {L, P} = eltype(L) @inline size(::Type{Padded{L, P}}, logical_size::NamedTuple) where {L, P} = size(L, pad_logical_coord(Padded{L, P}, logical_size)) -@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = load(L, workspace, tile) -@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile) where {L, P} = store!(L, workspace, value, tile::Tile) +@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, workspace_size::NamedTuple) where {L, P} = load(L, workspace, tile, pad_logical_coord(Padded{L, P}, workspace_size)) +@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile, workspace_size::NamedTuple) where {L, P} = store!(L, workspace, value, tile::Tile, pad_logical_coord(Padded{L, P}, workspace_size)) # --------------- # AlignedColMajor @@ -38,7 +38,7 @@ end struct AlignedColMajor{T} <: LayoutBase{T} end # TODO: cleanup vectorisation -@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size} +@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}, workspace_size::NamedTuple) where {T, size} vec_len = 16 ÷ sizeof(T) N = (sizeof(T) * vec_len) ÷ sizeof(Float32) res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef) @@ -47,8 +47,8 @@ struct AlignedColMajor{T} <: LayoutBase{T} end @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - linear_base = linearise(t.base, Base.size(workspace)) - linear_offset = linearise(t.offset, Base.size(workspace)) + linear_base = linearise(t.base, workspace_size) + linear_offset = linearise(t.offset, workspace_size) @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset) end @@ -57,15 +57,15 @@ struct AlignedColMajor{T} <: LayoutBase{T} end return res end -@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} +@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}, workspace_size::NamedTuple) where {T, size} vec_len = 16 ÷ sizeof(T) @unroll for j = 1 : size[2] @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - linear_base = linearise(t.base, Base.size(workspace)) - linear_offset = linearise(t.offset, Base.size(workspace)) + linear_base = linearise(t.base, workspace_size) + linear_offset = linearise(t.offset, workspace_size) vstorea!(Vec{vec_len, T}, pointer(workspace, linear_base), value[i, j], linear_offset) end From 11edd5eae721976034bd2bdbed89ddd7445e8157 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Mon, 25 May 2020 09:29:14 -0400 Subject: [PATCH 14/34] Use linearise(...) 
in operator --- src/device/matmul_kernels/operator.jl | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl index 809990b0..c745c565 100644 --- a/src/device/matmul_kernels/operator.jl +++ b/src/device/matmul_kernels/operator.jl @@ -27,32 +27,28 @@ struct WMMAOp{M, N, K} end function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = Tuple(tile.index) .+ 1 - @inbounds linear_index = LinearIndices(size(workspace))[ind...] + linear_index = linearise(tile.index, size(workspace)) ptr = pointer(workspace, linear_index) return WMMA.load_a(ptr, size(workspace, 1), WMMA.ColMajor, conf) end function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = Tuple(tile.index) .+ 1 - @inbounds linear_index = LinearIndices(size(workspace))[ind...] + linear_index = linearise(tile.index, size(workspace)) ptr = pointer(workspace, linear_index) return WMMA.load_b(ptr, size(workspace, 1), WMMA.ColMajor, conf) end function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = Tuple(tile.index) .+ 1 - @inbounds linear_index = LinearIndices(size(workspace))[ind...] + linear_index = linearise(tile.index, size(workspace)) ptr = pointer(workspace, linear_index) return WMMA.load_c(ptr, size(workspace, 1), WMMA.ColMajor, conf) end function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = Tuple(tile.index) .+ 1 - @inbounds linear_index = LinearIndices(size(workspace))[ind...] 
+ linear_index = linearise(tile.index, size(workspace)) ptr = pointer(workspace, linear_index) WMMA.store_d(ptr, frag, size(workspace, 1), WMMA.ColMajor, conf) end From 3f64767f7f17b388fde271a85b5ab855ad32f0ae Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Mon, 25 May 2020 09:56:16 -0400 Subject: [PATCH 15/34] Split translate function --- src/device/matmul_kernels/kernel.jl | 8 ++++---- src/device/tiling.jl | 20 +++++++++++++++++++- 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl index 1642de40..2b94cfb6 100644 --- a/src/device/matmul_kernels/kernel.jl +++ b/src/device/matmul_kernels/kernel.jl @@ -45,7 +45,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate_const(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile), tile) end end @@ -84,7 +84,7 @@ function matmul_impl(a, b, c, d, a_frags = MArray{Tuple{NUM_FRAGMENTS_M}, Operator.fragtype_a(OPERATOR, SHARED_A_LAYOUT)}(undef) @unroll for i = 1 : NUM_FRAGMENTS_M - a_tile = translate(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) + a_tile = translate_const(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile), a_tile) end @@ -92,7 +92,7 @@ function matmul_impl(a, b, c, d, b_frags = MArray{Tuple{NUM_FRAGMENTS_N}, Operator.fragtype_b(OPERATOR, SHARED_B_LAYOUT)}(undef) @unroll for j = 1 : NUM_FRAGMENTS_N - b_tile = translate(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) + b_tile = translate_const(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile), b_tile) end @@ -114,7 +114,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate_const(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile) end end diff --git a/src/device/tiling.jl b/src/device/tiling.jl index 1bc52e00..4133271b 100644 --- a/src/device/tiling.jl +++ b/src/device/tiling.jl @@ -119,7 +119,7 @@ export translate """ translate(tile::Tile{names, T}, offset::NamedTuple{names, T}) -Translate (i.e. move) a [`Tile`](@ref) by a constant `offset`. +Translate (i.e. move) a [`Tile`](@ref) by an `offset`. # Arguments - `tile`: The [`Tile`](@ref) to translate. @@ -132,6 +132,24 @@ end @inline translate(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate(tile, NamedTuple{names}(offset)) +export translate_const + +""" + translate_const(tile::Tile{names, T}, offset::NamedTuple{names, T}) + +Translate (i.e. move) a [`Tile`](@ref) by a constant `offset`. + +# Arguments +- `tile`: The [`Tile`](@ref) to translate. +- `offset`: The `offset` in each dimension. 
+""" +@inline function translate_const(tile::Tile{size, names, T}, offset::NamedTuple{names, T}) where {names, T, size} + offset = map(+, tile.offset, offset) + return Tile{size, names, T}(tile.base, offset) +end + +@inline translate_const(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate_const(tile, NamedTuple{names}(offset)) + # ------------- # TileIterators # ------------- From 2acd20b1f74fa77421f67cdcd827ef35d1619584 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Mon, 25 May 2020 10:31:35 -0400 Subject: [PATCH 16/34] Revert "Split translate function" This reverts commit 3f64767f7f17b388fde271a85b5ab855ad32f0ae. --- src/device/matmul_kernels/kernel.jl | 8 ++++---- src/device/tiling.jl | 20 +------------------- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl index 2b94cfb6..1642de40 100644 --- a/src/device/matmul_kernels/kernel.jl +++ b/src/device/matmul_kernels/kernel.jl @@ -45,7 +45,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate_const(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile), tile) end end @@ -84,7 +84,7 @@ function matmul_impl(a, b, c, d, a_frags = MArray{Tuple{NUM_FRAGMENTS_M}, Operator.fragtype_a(OPERATOR, SHARED_A_LAYOUT)}(undef) @unroll for i = 1 : NUM_FRAGMENTS_M - a_tile = translate_const(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) + a_tile = translate(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile), a_tile) end @@ -92,7 +92,7 @@ function matmul_impl(a, b, c, d, b_frags = MArray{Tuple{NUM_FRAGMENTS_N}, Operator.fragtype_b(OPERATOR, SHARED_B_LAYOUT)}(undef) @unroll for j = 1 : NUM_FRAGMENTS_N - b_tile = translate_const(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) + b_tile = translate(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile), b_tile) end @@ -114,7 +114,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate_const(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile) end end diff --git a/src/device/tiling.jl b/src/device/tiling.jl index 4133271b..1bc52e00 100644 --- a/src/device/tiling.jl +++ b/src/device/tiling.jl @@ -119,7 +119,7 @@ export translate """ translate(tile::Tile{names, T}, offset::NamedTuple{names, T}) -Translate (i.e. move) a [`Tile`](@ref) by an `offset`. +Translate (i.e. move) a [`Tile`](@ref) by a constant `offset`. # Arguments - `tile`: The [`Tile`](@ref) to translate. @@ -132,24 +132,6 @@ end @inline translate(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate(tile, NamedTuple{names}(offset)) -export translate_const - -""" - translate_const(tile::Tile{names, T}, offset::NamedTuple{names, T}) - -Translate (i.e. move) a [`Tile`](@ref) by a constant `offset`. - -# Arguments -- `tile`: The [`Tile`](@ref) to translate. 
-- `offset`: The `offset` in each dimension. -""" -@inline function translate_const(tile::Tile{size, names, T}, offset::NamedTuple{names, T}) where {names, T, size} - offset = map(+, tile.offset, offset) - return Tile{size, names, T}(tile.base, offset) -end - -@inline translate_const(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate_const(tile, NamedTuple{names}(offset)) - # ------------- # TileIterators # ------------- From 968d72c4055e2d7d24faa355e6271d477976dfb7 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Mon, 25 May 2020 10:39:24 -0400 Subject: [PATCH 17/34] Revert "Reintroduce workspace_size" This reverts commit be21dfaa3aebe148c5a6ca8dbc5a5418a5cbf169. --- src/device/matmul_kernels/epilogue.jl | 4 ++-- src/device/matmul_kernels/kernel.jl | 12 ++++++------ src/device/matmul_kernels/layout.jl | 16 ++++++++-------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/device/matmul_kernels/epilogue.jl b/src/device/matmul_kernels/epilogue.jl index f0f7eaaf..2f14d6b2 100644 --- a/src/device/matmul_kernels/epilogue.jl +++ b/src/device/matmul_kernels/epilogue.jl @@ -25,9 +25,9 @@ struct Default end # Cooperatively store a BLOCK_SHAPE.M x BLOCK_SHAPE.N tile of D from shared to global memory within one threadblock @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32) - x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile, block_tile.MN.size) + x = Layout.load(SHARED_D_LAYOUT, shmem_d, thread_tile) x = transform(x, thread_tile) - Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) + Layout.store!(GLOBAL_D_LAYOUT, d, x, translate(thread_tile, (M = block_i, N = block_j))) end end end diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl index 1642de40..d25c502c 100644 --- a/src/device/matmul_kernels/kernel.jl +++ b/src/device/matmul_kernels/kernel.jl @@ -30,9 +30,9 @@ function matmul_impl(a, b, c, d, @unroll for warp_tile = parallellise(block_tile.MN, Tile(MEM_CD_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_CD_THREAD), laneId, 32) - x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j)), gemm_sz.MN.size) + x = Layout.load(GLOBAL_C_LAYOUT, c, translate(thread_tile, (M = block_i, N = block_j))) x = transf_gl2sh_c(x, thread_tile) - Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile, block_tile.MN.size) + Layout.store!(SHARED_C_LAYOUT, shmem_c, x, thread_tile) end end @@ -61,18 +61,18 @@ function matmul_impl(a, b, c, d, # (3.1) Cooperatively load a BLOCK_SHAPE.M x BLOCK_SHAPE.K tile of A from global to shared memory within one threadblock @unroll for warp_tile = parallellise(block_tile.MK, Tile(MEM_A_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_A_THREAD), laneId, 32) - x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k)), gemm_sz.MK.size) + x = Layout.load(GLOBAL_A_LAYOUT, a, translate(thread_tile, (M = block_i, K = block_k))) x = transf_gl2sh_a(x, thread_tile) - Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile, block_tile.MK.size) + Layout.store!(SHARED_A_LAYOUT, shmem_a, x, thread_tile) end end # (3.2) Cooperatively load a BLOCK_SHAPE.K x BLOCK_SHAPE.N tile of B from global to shared memory within one threadblock @unroll for warp_tile = parallellise(block_tile.KN, 
Tile(MEM_B_WARP), warpId, WARPS_PER_BLOCK) @unroll for thread_tile = parallellise(warp_tile, Tile(MEM_B_THREAD), laneId, 32) - x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j)), gemm_sz.KN.size) + x = Layout.load(GLOBAL_B_LAYOUT, b, translate(thread_tile, (K = block_k, N = block_j))) x = transf_gl2sh_b(x, thread_tile) - Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile, block_tile.KN.size) + Layout.store!(SHARED_B_LAYOUT, shmem_b, x, thread_tile) end end diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 917f564e..12062df3 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -28,8 +28,8 @@ end @inline eltype(::Type{Padded{L, P}}) where {L, P} = eltype(L) @inline size(::Type{Padded{L, P}}, logical_size::NamedTuple) where {L, P} = size(L, pad_logical_coord(Padded{L, P}, logical_size)) -@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, workspace_size::NamedTuple) where {L, P} = load(L, workspace, tile, pad_logical_coord(Padded{L, P}, workspace_size)) -@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile, workspace_size::NamedTuple) where {L, P} = store!(L, workspace, value, tile::Tile, pad_logical_coord(Padded{L, P}, workspace_size)) +@inline load(::Type{Padded{L, P}}, workspace, tile::Tile, logical_size::NamedTuple) where {L, P} = load(L, workspace, tile) +@inline store!(::Type{Padded{L, P}}, workspace, value, tile::Tile) where {L, P} = store!(L, workspace, value, tile::Tile) # --------------- # AlignedColMajor @@ -38,7 +38,7 @@ end struct AlignedColMajor{T} <: LayoutBase{T} end # TODO: cleanup vectorisation -@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}, workspace_size::NamedTuple) where {T, size} +@inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size} vec_len = 16 ÷ sizeof(T) N = (sizeof(T) * vec_len) ÷ sizeof(Float32) res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef) @@ -47,8 +47,8 @@ struct AlignedColMajor{T} <: LayoutBase{T} end @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - linear_base = linearise(t.base, workspace_size) - linear_offset = linearise(t.offset, workspace_size) + linear_base = linearise(t.base, Base.size(workspace)) + linear_offset = linearise(t.offset, Base.size(workspace)) @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset) end @@ -57,15 +57,15 @@ struct AlignedColMajor{T} <: LayoutBase{T} end return res end -@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}, workspace_size::NamedTuple) where {T, size} +@inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} vec_len = 16 ÷ sizeof(T) @unroll for j = 1 : size[2] @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) - linear_base = linearise(t.base, workspace_size) - linear_offset = linearise(t.offset, workspace_size) + linear_base = linearise(t.base, Base.size(workspace)) + linear_offset = linearise(t.offset, Base.size(workspace)) vstorea!(Vec{vec_len, T}, pointer(workspace, linear_base), value[i, j], linear_offset) end From ee13ad86d978a05c1ecd34ede307992907957415 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 27 May 2020 05:13:36 -0400 Subject: [PATCH 18/34] Add components for complex matmul --- src/device/matmul_kernels/layout.jl | 66 ++++++++++++++++++++++++ 
src/device/matmul_kernels/operator.jl | 74 +++++++++++++++++++++++++++ 2 files changed, 140 insertions(+) diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 12062df3..59193139 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -72,4 +72,70 @@ end end end +# ------------------ +# InterleavedComplex +# ------------------ + +struct InterleavedComplex{T} <: LayoutBase{T} end + +@inline function load(::Type{InterleavedComplex{T}}, workspace, tile::Tile{size}) where {T, size} + res = MArray{Tuple{tile.size[1], tile.size[2]}, Complex{T}}(undef) + + @unroll for j = 1 : tile.size[2] + @unroll for i = 1 : tile.size[1] + t = translate(tile, (i - 1, j - 1)) + + @inbounds res[i, j] = workspace[t.index[1] + 1, t.index[2] + 1] + end + end + + return res +end + +@inline function store!(::Type{InterleavedComplex{T}}, workspace, value, tile::Tile{size}) where {T, size} + @unroll for j = 1 : size[2] + @unroll for i = 1 : size[1] + t = translate(tile, (i - 1, j - 1)) + + @inbounds workspace[t.index[1] + 1, t.index[2] + 1] = value[i, j] + end + end +end + +# ------------ +# SplitComplex +# ------------ + +struct SplitComplex{T} <: LayoutBase{T} end + +@inline function size(::Type{SplitComplex{T}}, logical_size::NamedTuple) where {T} + t = Tuple(logical_size) + return (t..., 2) +end + +@inline function load(::Type{SplitComplex{T}}, workspace, tile::Tile{size}) where {T, size} + res = MArray{Tuple{tile.size[1], tile.size[2]}, Complex{T}}(undef) + + @unroll for j = 1 : tile.size[2] + @unroll for i = 1 : tile.size[1] + t = translate(tile, (i - 1, j - 1)) + + @inbounds res[i,j] = workspace[t.index[1] + 1, t.index[2] + 1, 1] + workspace[t.index[1] + 1, t.index[2] + 1, 2] * im + end + end + + return res +end + +@inline function store!(::Type{SplitComplex{T}}, workspace, value, tile::Tile{size}) where {T, size} + @unroll for j = 1 : tile.size[2] + @unroll for i = 1 : tile.size[1] + t = translate(tile, (i - 1, j - 1)) + + @inbounds workspace[t.index[1] + 1, t.index[2] + 1, 1] = value[i, j].re + @inbounds workspace[t.index[1] + 1, t.index[2] + 1, 2] = value[i, j].im + end + end +end + end diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl index c745c565..3f46ddac 100644 --- a/src/device/matmul_kernels/operator.jl +++ b/src/device/matmul_kernels/operator.jl @@ -58,4 +58,78 @@ function mma(::Type{WMMAOp{M, N, K}}, a_frag, b_frag, c_frag) where {M, N, K} return WMMA.mma(a_frag, b_frag, c_frag, conf) end +# ----------- +# WMMAComplex +# ----------- + +struct WMMAComplexOp{M, N, K} end + +@inline shape(::Type{WMMAComplexOp{M, N, K}}) where {M, N, K} = (M = M, N = N, K = K) + +@inline fragtype_a(::Type{WMMAComplexOp{16, 16, 16}}, ::Type{Layout.SplitComplex{Float16}}) = NTuple{2, WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixA}} +@inline fragtype_b(::Type{WMMAComplexOp{16, 16, 16}}, ::Type{Layout.SplitComplex{Float16}}) = NTuple{2, WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixB}} +@inline fragtype_accum(::Type{WMMAComplexOp{16, 16, 16}}, ::Type{Layout.SplitComplex{Float32}}) = NTuple{2, WMMA.Fragment{16, 16, 16, 8, Float32, WMMA.Unspecified, WMMA.Accumulator}} + +@inline function load_a(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} + conf = WMMA.Config{16, 16, 16, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + return (WMMA.load_a(pointer(workspace, ind), size(workspace)[1], 
WMMA.ColMajor, conf), + WMMA.load_a(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) +end + +@inline function load_b(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} + conf = WMMA.Config{16, 16, 16, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + return (WMMA.load_b(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), + WMMA.load_b(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) +end + +@inline function load_c(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, tile::Tile) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + return (WMMA.load_c(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), + WMMA.load_c(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) +end + +@inline function store_d(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, frag, tile::Tile) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + WMMA.store_d(pointer(workspace, ind), frag[1], size(workspace)[1], WMMA.ColMajor, conf) + WMMA.store_d(pointer(workspace, ind + prod(size(workspace)[1:2])), frag[2], size(workspace)[1], WMMA.ColMajor, conf) +end + +using LLVM + +multiply_fp16(a::Float16, b::Float16) = + Base.bitcast(Float16, + LLVM.Interop.@asmcall( + "{mul.f16 \$0,\$1,\$2;}", + "=h,h,h", + false, + Int16, + Tuple{Int16, Int16}, + Base.bitcast(Int16, a), + Base.bitcast(Int16, b) + ) + ) + +@inline function mma(::Type{WMMAComplexOp{M, N, K}}, a_frag, b_frag, c_frag) where {M, N, K} + conf = WMMA.Config{16, 16, 16, Float32} + + c_re = c_frag[1] + c_im = c_frag[2] + + c_re = WMMA.mma(a_frag[1], b_frag[1], c_re, conf) + c_re = WMMA.mma(multiply_fp16.(a_frag[2], Float16(-1)), b_frag[2], c_re, conf) + + c_im = WMMA.mma(a_frag[1], b_frag[2], c_im, conf) + c_im = WMMA.mma(a_frag[2], b_frag[1], c_im, conf) + + return (c_re, c_im) +end + end From c709daf53a53b905f78d1e08778e9af29c30b176 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 27 May 2020 05:16:25 -0400 Subject: [PATCH 19/34] Add test for complex matmul --- test/device/matmul_kernels.jl | 49 +++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/test/device/matmul_kernels.jl b/test/device/matmul_kernels.jl index f3a2d085..6d5a4a00 100644 --- a/test/device/matmul_kernels.jl +++ b/test/device/matmul_kernels.jl @@ -59,6 +59,55 @@ using CUDAnative.MatMul @test all(isapprox.(Float32.(a_h) * Float32.(b_h) + alpha * c_h, Array(d); rtol = sqrt(eps(Float16)))) end end + + @testset "WMMA Complex GEMM" begin + @testset "(M = $M, N = $N, K = $K)" for M in [128, 256, 1024, 2048], + N in [128, 256, 1024, 2048], + K in [128, 256, 1024, 2048] + + a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K)); + b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K)); + c_h = rand(Complex{Float32}, (M, N)); + + a = CuArray(a_h); + b = CuArray(b_h); + c = CuArray(c_h); + d = similar(c); + + conf = MatMul.get_config( + gemm_shape = (M = M, N = N, K = K), + operator = Operator.WMMAComplexOp{16, 16, 16}, + + global_a_layout = Layout.InterleavedComplex{Float16}, + global_b_layout = Layout.InterleavedComplex{Float16}, + global_c_layout = Layout.InterleavedComplex{Float32}, + global_d_layout = Layout.InterleavedComplex{Float32}, + + shared_a_layout 
= Layout.Padded{Layout.SplitComplex{Float16}, 8}, + shared_b_layout = Layout.Padded{Layout.SplitComplex{Float16}, 8}, + shared_c_layout = Layout.SplitComplex{Float32}, + shared_d_layout = Layout.SplitComplex{Float32}, + + warps_per_block = 8, + + compute_warp = (M = 16, N = 32, K = 16), + + block_shape = (M = 64, N = 64, K = 32), + + mem_a_warp = (M = 64, K = 2), + mem_b_warp = (K = 32, N = 4), + mem_cd_warp = (M = 64, N = 1), + + mem_a_thread = (M = 4, K = 1), + mem_b_thread = (K = 4, N = 1), + mem_cd_thread = (M = 2, N = 1) + ) + + MatMul.matmul(a, b, c, d, conf) + + @test all(isapprox.(a_h * b_h + c_h, Array(d); rtol=sqrt(eps(Float16)))); + end + end end ################################################################################ From e9b9c5c900862487b1f612052253f94225bdfa0d Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 27 May 2020 05:35:32 -0400 Subject: [PATCH 20/34] Reduce test set size --- test/device/matmul_kernels.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/device/matmul_kernels.jl b/test/device/matmul_kernels.jl index 6d5a4a00..554a024f 100644 --- a/test/device/matmul_kernels.jl +++ b/test/device/matmul_kernels.jl @@ -61,9 +61,9 @@ using CUDAnative.MatMul end @testset "WMMA Complex GEMM" begin - @testset "(M = $M, N = $N, K = $K)" for M in [128, 256, 1024, 2048], - N in [128, 256, 1024, 2048], - K in [128, 256, 1024, 2048] + @testset "(M = $M, N = $N, K = $K)" for M in [128, 256], + N in [128, 256], + K in [128, 256] a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K)); b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K)); From a0d1d285a08cdc13cfc4c1f7750330f6eed4109f Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 27 May 2020 06:16:32 -0400 Subject: [PATCH 21/34] Add scripts to profile complex CUTLASS --- .../matmul_kernels/complex-wmma/.gitignore | 2 + .../matmul_kernels/complex-wmma/cudanative.jl | 60 +++++++++++++++++++ .../matmul_kernels/complex-wmma/cudanative.sh | 22 +++++++ .../matmul_kernels/complex-wmma/cutlass.sh | 22 +++++++ test/perf/matmul_kernels/complex-wmma/plot.jl | 24 ++++++++ 5 files changed, 130 insertions(+) create mode 100644 test/perf/matmul_kernels/complex-wmma/.gitignore create mode 100644 test/perf/matmul_kernels/complex-wmma/cudanative.jl create mode 100755 test/perf/matmul_kernels/complex-wmma/cudanative.sh create mode 100755 test/perf/matmul_kernels/complex-wmma/cutlass.sh create mode 100644 test/perf/matmul_kernels/complex-wmma/plot.jl diff --git a/test/perf/matmul_kernels/complex-wmma/.gitignore b/test/perf/matmul_kernels/complex-wmma/.gitignore new file mode 100644 index 00000000..5919a65f --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/.gitignore @@ -0,0 +1,2 @@ +*.csv +*.pdf diff --git a/test/perf/matmul_kernels/complex-wmma/cudanative.jl b/test/perf/matmul_kernels/complex-wmma/cudanative.jl new file mode 100644 index 00000000..d628c23c --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/cudanative.jl @@ -0,0 +1,60 @@ +using CUDAdrv +using CUDAnative +using CUDAnative.MatMul +using CuArrays + +M = parse(Int, ARGS[1]) +N = parse(Int, ARGS[2]) +K = parse(Int, ARGS[3]) + +function benchmark_matmul(a, b, c, d) + CuArrays.@sync begin + conf = MatMul.get_config( + gemm_shape = (M = M, N = N, K = K), + operator = Operator.WMMAComplexOp{16, 16, 16}, + + global_a_layout = Layout.InterleavedComplex{Float16}, + global_b_layout = Layout.InterleavedComplex{Float16}, + global_c_layout = Layout.InterleavedComplex{Float32}, + global_d_layout = 
Layout.InterleavedComplex{Float32}, + + shared_a_layout = Layout.Padded{Layout.SplitComplex{Float16}, 8}, + shared_b_layout = Layout.Padded{Layout.SplitComplex{Float16}, 8}, + shared_c_layout = Layout.SplitComplex{Float32}, + shared_d_layout = Layout.SplitComplex{Float32}, + + warps_per_block = 8, + + compute_warp = (M = 16, N = 32, K = 16), + + block_shape = (M = 64, N = 64, K = 32), + + mem_a_warp = (M = 64, K = 2), + mem_b_warp = (K = 32, N = 4), + mem_cd_warp = (M = 64, N = 1), + + mem_a_thread = (M = 4, K = 1), + mem_b_thread = (K = 4, N = 1), + mem_cd_thread = (M = 2, N = 1) + ) + + MatMul.matmul(a, b, c, d, conf) + end +end + +a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K)); +b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K)); +c_h = rand(Complex{Float32}, (M, N)); + +a = CuArray(a_h); +b = CuArray(b_h); +c = CuArray(c_h); +d = similar(c); + +# warmup +benchmark_matmul(a, b, c, d) + +# profile +for i = 1 : 10 + CUDAdrv.@profile benchmark_matmul(a, b, c, d) +end diff --git a/test/perf/matmul_kernels/complex-wmma/cudanative.sh b/test/perf/matmul_kernels/complex-wmma/cudanative.sh new file mode 100755 index 00000000..f9b5d7ab --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/cudanative.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative.jl $N $N $N 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative.csv +done diff --git a/test/perf/matmul_kernels/complex-wmma/cutlass.sh b/test/perf/matmul_kernels/complex-wmma/cutlass.sh new file mode 100755 index 00000000..a1ed319d --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/cutlass.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +CUTLASS_BUILD_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cutlass.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(nv-nsight-cu-cli -f --summary per-kernel --csv --units base -k Kernel ${CUTLASS_BUILD_PATH}/examples/10_planar_complex/10_planar_complex --m=$N --n=$N --k=$N 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cutlass.csv +done diff --git a/test/perf/matmul_kernels/complex-wmma/plot.jl b/test/perf/matmul_kernels/complex-wmma/plot.jl new file mode 100644 index 00000000..c274b798 --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/plot.jl @@ -0,0 +1,24 @@ +using CSV +using DataFrames +using Plots + +pyplot() + +function plot_results(file, label) + df = DataFrame(CSV.File(file)) + + N = df[!, :N] + mean_runtime = df[!, :runtime] .* 1e3 # in ps + + tflops = (8 .* N .^ 3) ./ mean_runtime + + plot!(N, tflops, label=label, xscale=:log2, markershape=:circle) +end + +plot_results("cudanative.csv", "CUDAnative") +plot_results("cutlass.csv", "CUTLASS Example") + +title!("Performance of mixed-precision complex GEMM\nProblem size: N x N x N") +xlabel!("N") +ylabel!("TFLOPS") +savefig("plot.pdf") From 8ce4dab5f57b7599a207bfe6462230f83b9b2473 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Wed, 27 May 2020 08:24:27 
-0400 Subject: [PATCH 22/34] Add dual op --- src/device/matmul_kernels/operator.jl | 58 +++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl index 3f46ddac..65724637 100644 --- a/src/device/matmul_kernels/operator.jl +++ b/src/device/matmul_kernels/operator.jl @@ -132,4 +132,62 @@ multiply_fp16(a::Float16, b::Float16) = return (c_re, c_im) end +# -------- +# WMMADual +# -------- + +struct WMMADualOp{M, N, K} end + +@inline shape(::Type{WMMADualOp{M, N, K}}) where {M, N, K} = (M = M, N = N, K = K) + +@inline fragtype_a(::Type{WMMADualOp{16, 16, 16}}, ::Type{Layout.SplitComplex{Float16}}) = NTuple{2, WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixA}} +@inline fragtype_b(::Type{WMMADualOp{16, 16, 16}}, ::Type{Layout.SplitComplex{Float16}}) = NTuple{2, WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixB}} +@inline fragtype_accum(::Type{WMMADualOp{16, 16, 16}}, ::Type{Layout.SplitComplex{Float32}}) = NTuple{2, WMMA.Fragment{16, 16, 16, 8, Float32, WMMA.Unspecified, WMMA.Accumulator}} + +@inline function load_a(::Type{WMMADualOp{M, N, K}}, ::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} + conf = WMMA.Config{16, 16, 16, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + return (WMMA.load_a(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), + WMMA.load_a(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) +end + +@inline function load_b(::Type{WMMADualOp{M, N, K}}, ::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} + conf = WMMA.Config{16, 16, 16, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + return (WMMA.load_b(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), + WMMA.load_b(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) +end + +@inline function load_c(::Type{WMMADualOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, tile::Tile) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + return (WMMA.load_c(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), + WMMA.load_c(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) +end + +@inline function store_d(::Type{WMMADualOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, frag, tile::Tile) where {M, N, K} + conf = WMMA.Config{M, N, K, Float32} + ind = linearise(tile.index, size(workspace)[1:2]) + + WMMA.store_d(pointer(workspace, ind), frag[1], size(workspace)[1], WMMA.ColMajor, conf) + WMMA.store_d(pointer(workspace, ind + prod(size(workspace)[1:2])), frag[2], size(workspace)[1], WMMA.ColMajor, conf) +end + +@inline function mma(::Type{WMMADualOp{M, N, K}}, a_frag, b_frag, c_frag) where {M, N, K} + conf = WMMA.Config{16, 16, 16, Float32} + + c_re = c_frag[1] + c_du = c_frag[2] + + c_re = WMMA.mma(a_frag[1], b_frag[1], c_re, conf) + + c_du = WMMA.mma(a_frag[1], b_frag[2], c_du, conf) + c_du = WMMA.mma(a_frag[2], b_frag[1], c_du, conf) + + return (c_re, c_du) +end + end From 340e79168f98b4f98ec45477df74ea2ce7a00ccf Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Fri, 29 May 2020 14:05:03 -0400 Subject: [PATCH 23/34] Add translate variant for offset --- src/device/matmul_kernels/kernel.jl | 8 +++--- src/device/matmul_kernels/operator.jl | 36 ++++++++++++++++++--------- 
src/device/tiling.jl | 9 +++++++ 3 files changed, 37 insertions(+), 16 deletions(-) diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl index d25c502c..baab4e19 100644 --- a/src/device/matmul_kernels/kernel.jl +++ b/src/device/matmul_kernels/kernel.jl @@ -45,7 +45,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate_offset(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile), tile) end end @@ -84,7 +84,7 @@ function matmul_impl(a, b, c, d, a_frags = MArray{Tuple{NUM_FRAGMENTS_M}, Operator.fragtype_a(OPERATOR, SHARED_A_LAYOUT)}(undef) @unroll for i = 1 : NUM_FRAGMENTS_M - a_tile = translate(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) + a_tile = translate_offset(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile), a_tile) end @@ -92,7 +92,7 @@ function matmul_impl(a, b, c, d, b_frags = MArray{Tuple{NUM_FRAGMENTS_N}, Operator.fragtype_b(OPERATOR, SHARED_B_LAYOUT)}(undef) @unroll for j = 1 : NUM_FRAGMENTS_N - b_tile = translate(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) + b_tile = translate_offset(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile), b_tile) end @@ -114,7 +114,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate_offset(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile) end end diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl index 65724637..c4eabc3e 100644 --- a/src/device/matmul_kernels/operator.jl +++ b/src/device/matmul_kernels/operator.jl @@ -25,31 +25,43 @@ struct WMMAOp{M, N, K} end @inline fragtype_b(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float16}}) = WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixB} @inline fragtype_accum(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float32}}) = WMMA.Fragment{16, 16, 16, 8, Float32, WMMA.Unspecified, WMMA.Accumulator} -function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} +@inline function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - linear_index = linearise(tile.index, size(workspace)) - ptr = pointer(workspace, linear_index) + + linear_base = linearise(tile.base, size(workspace)) + linear_offset = linearise(tile.offset, size(workspace)) + + ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float16) return WMMA.load_a(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} +@inline function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, 
Float32} - linear_index = linearise(tile.index, size(workspace)) - ptr = pointer(workspace, linear_index) + + linear_base = linearise(tile.base, size(workspace)) + linear_offset = linearise(tile.offset, size(workspace)) + + ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float16) return WMMA.load_b(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K} +@inline function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - linear_index = linearise(tile.index, size(workspace)) - ptr = pointer(workspace, linear_index) + + linear_base = linearise(tile.base, size(workspace)) + linear_offset = linearise(tile.offset, size(workspace)) + + ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float32) return WMMA.load_c(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K} +@inline function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - linear_index = linearise(tile.index, size(workspace)) - ptr = pointer(workspace, linear_index) + + linear_base = linearise(tile.base, size(workspace)) + linear_offset = linearise(tile.offset, size(workspace)) + + ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float32) WMMA.store_d(ptr, frag, size(workspace, 1), WMMA.ColMajor, conf) end diff --git a/src/device/tiling.jl b/src/device/tiling.jl index 1bc52e00..891a6315 100644 --- a/src/device/tiling.jl +++ b/src/device/tiling.jl @@ -132,6 +132,15 @@ end @inline translate(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate(tile, NamedTuple{names}(offset)) +export translate_offset + +@inline function translate_offset(tile::Tile{size, names, T}, offset::NamedTuple{names, T}) where {names, T, size} + new_offset = map(+, tile.offset, offset) + return Tile{size, names, T}(tile.base, new_offset) +end + +@inline translate_offset(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate_offset(tile, NamedTuple{names}(offset)) + # ------------- # TileIterators # ------------- From 1ca56c4f66560de9b9a2881b0c08e25bef9e92b0 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Fri, 29 May 2020 14:17:08 -0400 Subject: [PATCH 24/34] Revert "Add translate variant for offset" This reverts commit 340e79168f98b4f98ec45477df74ea2ce7a00ccf. 
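A note on the base/offset decomposition used in this patch (and in the AlignedColMajor loads and stores earlier in the series): for zero-based coordinates in a column-major array, linearising the base and the offset separately and summing the results reproduces the linear index of the combined coordinate, which is why a pointer at `linear_base` plus an element offset of `linear_offset - 1` addresses the right element. The sketch below checks this on the host; it uses a simplified `linearise` over plain tuples as a stand-in for the NamedTuple-based helper in src/device/tiling.jl, and the sizes are arbitrary.

    # Illustrative host-side check of the identity behind the base/offset split.
    # `linearise` here is a simplified stand-in taking plain tuples of zero-based
    # coordinates; the in-tree helper works on NamedTuples.
    using Test

    linearise(coord, dims) = LinearIndices(dims)[(coord .+ 1)...]

    dims   = (64, 32)    # arbitrary workspace size
    base   = (16, 8)     # zero-based tile base
    offset = (4, 2)      # zero-based offset within the tile

    # Summing the two one-based linear indices (minus 1) equals linearising
    # the combined coordinate.
    @test linearise(base, dims) + linearise(offset, dims) - 1 ==
          linearise(base .+ offset, dims)
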
--- src/device/matmul_kernels/kernel.jl | 8 +++--- src/device/matmul_kernels/operator.jl | 36 +++++++++------------------ src/device/tiling.jl | 9 ------- 3 files changed, 16 insertions(+), 37 deletions(-) diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl index baab4e19..d25c502c 100644 --- a/src/device/matmul_kernels/kernel.jl +++ b/src/device/matmul_kernels/kernel.jl @@ -45,7 +45,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate_offset(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile), tile) end end @@ -84,7 +84,7 @@ function matmul_impl(a, b, c, d, a_frags = MArray{Tuple{NUM_FRAGMENTS_M}, Operator.fragtype_a(OPERATOR, SHARED_A_LAYOUT)}(undef) @unroll for i = 1 : NUM_FRAGMENTS_M - a_tile = translate_offset(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) + a_tile = translate(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0)) @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile), a_tile) end @@ -92,7 +92,7 @@ function matmul_impl(a, b, c, d, b_frags = MArray{Tuple{NUM_FRAGMENTS_N}, Operator.fragtype_b(OPERATOR, SHARED_B_LAYOUT)}(undef) @unroll for j = 1 : NUM_FRAGMENTS_N - b_tile = translate_offset(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) + b_tile = translate(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N)) @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile), b_tile) end @@ -114,7 +114,7 @@ function matmul_impl(a, b, c, d, @unroll for i = 1 : NUM_FRAGMENTS_M @unroll for j = 1 : NUM_FRAGMENTS_N - tile = translate_offset(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) + tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N)) Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile) end end diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl index c4eabc3e..65724637 100644 --- a/src/device/matmul_kernels/operator.jl +++ b/src/device/matmul_kernels/operator.jl @@ -25,43 +25,31 @@ struct WMMAOp{M, N, K} end @inline fragtype_b(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float16}}) = WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixB} @inline fragtype_accum(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float32}}) = WMMA.Fragment{16, 16, 16, 8, Float32, WMMA.Unspecified, WMMA.Accumulator} -@inline function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} +function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - - linear_base = linearise(tile.base, size(workspace)) - linear_offset = linearise(tile.offset, size(workspace)) - - ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float16) + linear_index = linearise(tile.index, size(workspace)) + ptr = pointer(workspace, linear_index) return WMMA.load_a(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -@inline function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} +function load_b(::Type{WMMAOp{M, 
N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - - linear_base = linearise(tile.base, size(workspace)) - linear_offset = linearise(tile.offset, size(workspace)) - - ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float16) + linear_index = linearise(tile.index, size(workspace)) + ptr = pointer(workspace, linear_index) return WMMA.load_b(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -@inline function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K} +function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - - linear_base = linearise(tile.base, size(workspace)) - linear_offset = linearise(tile.offset, size(workspace)) - - ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float32) + linear_index = linearise(tile.index, size(workspace)) + ptr = pointer(workspace, linear_index) return WMMA.load_c(ptr, size(workspace, 1), WMMA.ColMajor, conf) end -@inline function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K} +function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - - linear_base = linearise(tile.base, size(workspace)) - linear_offset = linearise(tile.offset, size(workspace)) - - ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float32) + linear_index = linearise(tile.index, size(workspace)) + ptr = pointer(workspace, linear_index) WMMA.store_d(ptr, frag, size(workspace, 1), WMMA.ColMajor, conf) end diff --git a/src/device/tiling.jl b/src/device/tiling.jl index 891a6315..1bc52e00 100644 --- a/src/device/tiling.jl +++ b/src/device/tiling.jl @@ -132,15 +132,6 @@ end @inline translate(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate(tile, NamedTuple{names}(offset)) -export translate_offset - -@inline function translate_offset(tile::Tile{size, names, T}, offset::NamedTuple{names, T}) where {names, T, size} - new_offset = map(+, tile.offset, offset) - return Tile{size, names, T}(tile.base, new_offset) -end - -@inline translate_offset(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate_offset(tile, NamedTuple{names}(offset)) - # ------------- # TileIterators # ------------- From b33cbce03992c5cbc0f69a05c725de62cf9546f3 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Fri, 29 May 2020 16:18:29 -0400 Subject: [PATCH 25/34] Remove explicit vectorisation --- src/device/matmul_kernels/layout.jl | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 59193139..381eac30 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -39,18 +39,16 @@ struct AlignedColMajor{T} <: LayoutBase{T} end # TODO: cleanup vectorisation @inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size} - vec_len = 16 ÷ sizeof(T) - N = (sizeof(T) * vec_len) ÷ sizeof(Float32) - res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef) + res = MArray{Tuple{size[1], size[2]}, T}(undef) @unroll for j = 1 : size[2] - @unroll for i = 1 : vec_len : size[1] + @unroll for i = 1 : size[1] t = 
translate(tile, (i - 1, j - 1)) linear_base = linearise(t.base, Base.size(workspace)) linear_offset = linearise(t.offset, Base.size(workspace)) - @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset) + @inbounds res[i, j] = workspace[linear_base + linear_offset - 1] end end @@ -58,16 +56,14 @@ struct AlignedColMajor{T} <: LayoutBase{T} end end @inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} - vec_len = 16 ÷ sizeof(T) - @unroll for j = 1 : size[2] - @unroll for i = 1 : vec_len : size[1] + @unroll for i = 1 : size[1] t = translate(tile, (i - 1, j - 1)) linear_base = linearise(t.base, Base.size(workspace)) linear_offset = linearise(t.offset, Base.size(workspace)) - vstorea!(Vec{vec_len, T}, pointer(workspace, linear_base), value[i, j], linear_offset) + @inbounds workspace[linear_base + linear_offset - 1] = value[i,j] end end end From 2298f57baec6217fd26aa4311d9e46be8d660b11 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 05:21:07 -0400 Subject: [PATCH 26/34] Revert "Remove explicit vectorisation" This reverts commit b33cbce03992c5cbc0f69a05c725de62cf9546f3. --- src/device/matmul_kernels/layout.jl | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 381eac30..59193139 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -39,16 +39,18 @@ struct AlignedColMajor{T} <: LayoutBase{T} end # TODO: cleanup vectorisation @inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size} - res = MArray{Tuple{size[1], size[2]}, T}(undef) + vec_len = 16 ÷ sizeof(T) + N = (sizeof(T) * vec_len) ÷ sizeof(Float32) + res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef) @unroll for j = 1 : size[2] - @unroll for i = 1 : size[1] + @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) linear_base = linearise(t.base, Base.size(workspace)) linear_offset = linearise(t.offset, Base.size(workspace)) - @inbounds res[i, j] = workspace[linear_base + linear_offset - 1] + @inbounds res[i, j] = vloada(Vec{vec_len, T}, pointer(workspace, linear_base), linear_offset) end end @@ -56,14 +58,16 @@ struct AlignedColMajor{T} <: LayoutBase{T} end end @inline function store!(::Type{AlignedColMajor{T}}, workspace, value, tile::Tile{size}) where {T, size} + vec_len = 16 ÷ sizeof(T) + @unroll for j = 1 : size[2] - @unroll for i = 1 : size[1] + @unroll for i = 1 : vec_len : size[1] t = translate(tile, (i - 1, j - 1)) linear_base = linearise(t.base, Base.size(workspace)) linear_offset = linearise(t.offset, Base.size(workspace)) - @inbounds workspace[linear_base + linear_offset - 1] = value[i,j] + vstorea!(Vec{vec_len, T}, pointer(workspace, linear_base), value[i, j], linear_offset) end end end From c93cf4d662cb53d5c995c947f0d7b09bee7f0293 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 08:40:46 -0400 Subject: [PATCH 27/34] Fix vectorisation --- src/device/matmul_kernels/layout.jl | 3 +-- src/device/pointer.jl | 34 ++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index 59193139..c638cbde 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -40,8 +40,7 @@ struct AlignedColMajor{T} <: LayoutBase{T} end # TODO: 
cleanup vectorisation @inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size} vec_len = 16 ÷ sizeof(T) - N = (sizeof(T) * vec_len) ÷ sizeof(Float32) - res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef) + res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{vec_len, VecElement{T}}}(undef) @unroll for j = 1 : size[2] @unroll for i = 1 : vec_len : size[1] diff --git a/src/device/pointer.jl b/src/device/pointer.jl index d335dab7..8f3528b1 100644 --- a/src/device/pointer.jl +++ b/src/device/pointer.jl @@ -261,23 +261,37 @@ export Vec struct Vec{N, T} end export vloada -@inline @generated function vloada(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, i::Integer = 1) where {N, T, AS} - alignment = sizeof(T) * N - vec_len = (sizeof(T) * N) ÷ sizeof(Float32) +@inline @generated function vloada(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, i::Integer=1) where {N, T, AS} + jl_ty = convert(LLVMType, T) + as = convert(Int, AS) + + ir = "%ptr = inttoptr i64 %0 to $jl_ty addrspace($as)* + %gep = getelementptr $jl_ty, $jl_ty addrspace($as)* %ptr, i64 %1 + + %vecptr = bitcast $jl_ty addrspace($as)* %gep to <$N x $jl_ty> addrspace($as)* + %val = load <$N x $jl_ty>, <$N x $jl_ty> addrspace($as)* %vecptr, align 16 + + ret <$N x $jl_ty> %val" return quote - vec_ptr = convert(CUDAnative.DevicePtr{NTuple{$vec_len, VecElement{Float32}}, AS}, ptr) - return unsafe_load(vec_ptr, (i - 1) ÷ N + 1, Val($alignment)) + Base.llvmcall($ir, NTuple{N, VecElement{T}}, Tuple{CUDAnative.DevicePtr{T, AS}, Int64}, ptr, i - 1) end end export vstorea! -@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, x, i::Integer = 1) where {N, T, AS} - alignment = sizeof(T) * N - vec_len = (sizeof(T) * N) ÷ sizeof(Float32) +@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, x, i::Integer=1) where {N, T, AS} + jl_ty = convert(LLVMType, T) + as = convert(Int, AS) + + ir = "%ptr = inttoptr i64 %0 to $jl_ty addrspace($as)* + %gep = getelementptr $jl_ty, $jl_ty addrspace($as)* %ptr, i64 %1 + + %vecptr = bitcast $jl_ty addrspace($as)* %gep to <$N x $jl_ty> addrspace($as)* + store <$N x $jl_ty> %2, <$N x $jl_ty> addrspace($as)* %vecptr, align 16 + + ret void" return quote - vec_ptr = convert(CUDAnative.DevicePtr{NTuple{$vec_len, VecElement{Float32}}, AS}, ptr) - unsafe_store!(vec_ptr, x, (i - 1) ÷ N + 1, Val($alignment)) + Base.llvmcall($ir, Nothing, Tuple{CUDAnative.DevicePtr{T, AS}, Int64, NTuple{N, VecElement{T}}}, ptr, i - 1, x) end end From 47710e28eb5488d38439a58a124d38bae9fc8af1 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 08:53:32 -0400 Subject: [PATCH 28/34] Revert "Fix vectorisation" This reverts commit c93cf4d662cb53d5c995c947f0d7b09bee7f0293. 
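For context on the vloada/vstorea! helpers that patches 27 and 28 switch between implementations: both variants expose the same interface, an aligned vectorised load or store of N consecutive elements of type T through a DevicePtr, indexed in elements. A minimal usage sketch under stated assumptions follows; the function name, arguments, and indexing are illustrative and not part of this patch set, and dst/src are assumed to be 16-byte-aligned DevicePtr{Float16} values.

using CUDAnative

# Illustrative sketch only: each thread copies 8 contiguous Float16 values
# (16 bytes) with one aligned vector load and one aligned vector store.
function vec_copy(dst, src)
    i = (threadIdx().x - 1) * 8 + 1          # 1-based element index, multiple of the vector width
    val = vloada(Vec{8, Float16}, src, i)    # aligned 16-byte load
    vstorea!(Vec{8, Float16}, dst, val, i)   # aligned 16-byte store
    return
end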
--- src/device/matmul_kernels/layout.jl | 3 ++- src/device/pointer.jl | 34 +++++++++-------------------- 2 files changed, 12 insertions(+), 25 deletions(-) diff --git a/src/device/matmul_kernels/layout.jl b/src/device/matmul_kernels/layout.jl index c638cbde..59193139 100644 --- a/src/device/matmul_kernels/layout.jl +++ b/src/device/matmul_kernels/layout.jl @@ -40,7 +40,8 @@ struct AlignedColMajor{T} <: LayoutBase{T} end # TODO: cleanup vectorisation @inline function load(::Type{AlignedColMajor{T}}, workspace, tile::Tile{size}) where {T, size} vec_len = 16 ÷ sizeof(T) - res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{vec_len, VecElement{T}}}(undef) + N = (sizeof(T) * vec_len) ÷ sizeof(Float32) + res = MArray{Tuple{size[1] ÷ vec_len, size[2]}, NTuple{N, VecElement{Float32}}}(undef) @unroll for j = 1 : size[2] @unroll for i = 1 : vec_len : size[1] diff --git a/src/device/pointer.jl b/src/device/pointer.jl index 8f3528b1..d335dab7 100644 --- a/src/device/pointer.jl +++ b/src/device/pointer.jl @@ -261,37 +261,23 @@ export Vec struct Vec{N, T} end export vloada -@inline @generated function vloada(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, i::Integer=1) where {N, T, AS} - jl_ty = convert(LLVMType, T) - as = convert(Int, AS) - - ir = "%ptr = inttoptr i64 %0 to $jl_ty addrspace($as)* - %gep = getelementptr $jl_ty, $jl_ty addrspace($as)* %ptr, i64 %1 - - %vecptr = bitcast $jl_ty addrspace($as)* %gep to <$N x $jl_ty> addrspace($as)* - %val = load <$N x $jl_ty>, <$N x $jl_ty> addrspace($as)* %vecptr, align 16 - - ret <$N x $jl_ty> %val" +@inline @generated function vloada(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, i::Integer = 1) where {N, T, AS} + alignment = sizeof(T) * N + vec_len = (sizeof(T) * N) ÷ sizeof(Float32) return quote - Base.llvmcall($ir, NTuple{N, VecElement{T}}, Tuple{CUDAnative.DevicePtr{T, AS}, Int64}, ptr, i - 1) + vec_ptr = convert(CUDAnative.DevicePtr{NTuple{$vec_len, VecElement{Float32}}, AS}, ptr) + return unsafe_load(vec_ptr, (i - 1) ÷ N + 1, Val($alignment)) end end export vstorea! -@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, x, i::Integer=1) where {N, T, AS} - jl_ty = convert(LLVMType, T) - as = convert(Int, AS) - - ir = "%ptr = inttoptr i64 %0 to $jl_ty addrspace($as)* - %gep = getelementptr $jl_ty, $jl_ty addrspace($as)* %ptr, i64 %1 - - %vecptr = bitcast $jl_ty addrspace($as)* %gep to <$N x $jl_ty> addrspace($as)* - store <$N x $jl_ty> %2, <$N x $jl_ty> addrspace($as)* %vecptr, align 16 - - ret void" +@inline @generated function vstorea!(::Type{Vec{N, T}}, ptr::CUDAnative.DevicePtr{T, AS}, x, i::Integer = 1) where {N, T, AS} + alignment = sizeof(T) * N + vec_len = (sizeof(T) * N) ÷ sizeof(Float32) return quote - Base.llvmcall($ir, Nothing, Tuple{CUDAnative.DevicePtr{T, AS}, Int64, NTuple{N, VecElement{T}}}, ptr, i - 1, x) + vec_ptr = convert(CUDAnative.DevicePtr{NTuple{$vec_len, VecElement{Float32}}, AS}, ptr) + unsafe_store!(vec_ptr, x, (i - 1) ÷ N + 1, Val($alignment)) end end From 356ff584f2262fac353153d238dd67c333c79121 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 09:18:03 -0400 Subject: [PATCH 29/34] Revert "Merge branch 'master' into tf/matmul-kernel" This reverts commit f3325fd126ebb1ffb753ed394bfd421a2252c342, reversing changes made to d5ae922c60c0348225d028dbc8842c8faef96b05. 
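This revert also rolls the compiler interface back to the hook-based GPUCompiler wrappers (CUDACompilerTarget and CUDACompilerJob) restored in src/CUDAnative.jl below. A minimal sketch of constructing a compilation job under that restored API, mirroring the calls in execution.jl and reflection.jl; the kernel function and compute capability below are illustrative placeholders, not values taken from the patch.

using CUDAnative, GPUCompiler

dummy_kernel() = nothing

# build a target for a given compute capability, then a job for a kernel function
target = CUDAnative.CUDACompilerTarget(v"7.0")
job = CUDAnative.CUDACompilerJob(target, GPUCompiler.FunctionSpec(dummy_kernel, Tuple{}, true))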
--- .gitlab-ci.yml | 5 +- Manifest.toml | 65 +++++++++++++--------- Project.toml | 5 +- docs/Manifest.toml | 11 ++-- res/Manifest.toml | 36 ++++++++----- src/CUDAnative.jl | 35 +++++++----- src/compiler.jl | 27 ---------- src/device/array.jl | 3 +- src/device/cuda.jl | 2 - src/device/cuda/memory_shared.jl | 6 ++- src/device/cuda/output.jl | 20 ------- src/device/cuda/wmma.jl | 25 ++++++--- src/device/pointer.jl | 21 +++++--- src/device/runtime.jl | 12 ++--- src/device/tools.jl | 16 ++++++ src/execution.jl | 93 +++++++++++++++++++++++--------- src/reflection.jl | 18 +++---- test/codegen.jl | 6 +-- test/device/cuda.jl | 24 --------- test/device/execution.jl | 17 +++--- test/device/wmma.jl | 4 +- test/examples.jl | 12 ++++- test/init.jl | 2 +- test/runtests.jl | 24 ++++----- test/util.jl | 20 ++++--- 25 files changed, 279 insertions(+), 230 deletions(-) delete mode 100644 src/compiler.jl create mode 100644 src/device/tools.jl diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 2cb47515..c4f09dea 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -26,7 +26,7 @@ julia:1.4: - .test tags: - nvidia - - sm_70 + - sm_75 variables: CI_THOROUGH: 'true' @@ -36,11 +36,12 @@ julia:1.4-debug: - .test tags: - nvidia - - sm_70 + - sm_75 variables: CI_THOROUGH: 'true' CI_CLONE_ARGS: '-b v1.4.0' CI_BUILD_ARGS: 'BINARYBUILDER_LLVM_ASSERTS=1 debug' + allow_failure: true julia:nightly: extends: diff --git a/Manifest.toml b/Manifest.toml index 7c725cff..dc61c3bd 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -10,15 +10,15 @@ version = "1.0.1" uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" [[BinaryProvider]] -deps = ["Libdl", "Logging", "SHA"] -git-tree-sha1 = "428e9106b1ff27593cbd979afac9b45b82372b8c" +deps = ["Libdl", "SHA"] +git-tree-sha1 = "5b08ed6036d9d3f0ee6369410b830f8873d4024c" uuid = "b99e7846-7c00-51b0-8f62-c81ae34c0232" -version = "0.5.9" +version = "0.5.8" [[CEnum]] -git-tree-sha1 = "1b77a77c3b28e0b3f413f7567c9bb8dd9bdccd14" +git-tree-sha1 = "62847acab40e6855a9b5905ccb99c2b5cf6b3ebb" uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.3.0" +version = "0.2.0" [[CUDAapi]] deps = ["Libdl", "Logging"] @@ -28,20 +28,32 @@ version = "4.0.0" [[CUDAdrv]] deps = ["CEnum", "CUDAapi", "Printf"] -git-tree-sha1 = "f56bbf18c86bcff7a961a32a4947a5abb2963a29" +git-tree-sha1 = "e650cbaee92b60433313157926b1e80d0c3a0e2e" uuid = "c5f51814-7f29-56b8-a69c-e4d8f6be1fde" -version = "6.3.0" +version = "6.2.2" [[Cassette]] -git-tree-sha1 = "ff6f5109371926beb67ec3101be17d2c211e497d" +git-tree-sha1 = "f6a148cadd38ba328bd2c03442037ef801a6aa05" uuid = "7057c7e9-c182-5462-911a-8362d720325c" -version = "0.3.3" +version = "0.3.1" + +[[CodeTracking]] +deps = ["InteractiveUtils", "UUIDs"] +git-tree-sha1 = "0becdab7e6fbbcb7b88d8de5b72e5bb2f28239f3" +uuid = "da1fd8a2-8d9e-5ec2-8556-3022fb5608a2" +version = "0.5.8" + +[[Cthulhu]] +deps = ["CodeTracking", "InteractiveUtils", "REPL", "Unicode"] +git-tree-sha1 = "a4849ec61df9659423cc63b298ed895904ee9743" +uuid = "f68482b8-f384-11e8-15f7-abe071a5a75f" +version = "1.0.2" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "7d7578b00789cf16c5f68fad71868e773edd58a2" +git-tree-sha1 = "4dead20a1606a60292529023d6eac18a1ef6432e" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.16" +version = "0.17.12" [[Dates]] deps = ["Printf"] @@ -52,17 +64,17 @@ deps = ["Random", "Serialization", "Sockets"] uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" [[ExprTools]] -git-tree-sha1 = "6f0517056812fd6aa3af23d4b70d5325a2ae4e95" +git-tree-sha1 = 
"08c1f74d9ad03acf0ee84c12c9e665ab1a9a6e33" uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.1" +version = "0.1.0" [[GPUCompiler]] -deps = ["DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs", "UUIDs"] -git-tree-sha1 = "fa8dafad0ce15cfee2fa51e33ad4a743c1abdcca" -repo-rev = "2bb7f5c5224a6ff25ddccf910936ebc0b2a65273" -repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl.git" +deps = ["Cthulhu", "DataStructures", "InteractiveUtils", "LLVM", "Libdl", "TimerOutputs"] +git-tree-sha1 = "96e13df51390dd81625fd29857d53331dadbf13e" +repo-rev = "master" +repo-url = "https://github.com/JuliaGPU/GPUCompiler.jl" uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.2.0" +version = "0.1.0" [[GPUifyLoops]] deps = ["Cassette", "Requires", "StaticArrays"] @@ -76,9 +88,9 @@ uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[LLVM]] deps = ["CEnum", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "93d2e1e960fe47db1a9015e86fad1d47cf67cf59" +git-tree-sha1 = "b6b86801ae2f2682e0a4889315dc76b68db2de71" uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "1.4.1" +version = "1.3.4" [[LibGit2]] deps = ["Printf"] @@ -99,9 +111,10 @@ deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[OrderedCollections]] -git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.2.0" +version = "1.1.0" [[Pkg]] deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] @@ -140,9 +153,9 @@ uuid = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" [[StaticArrays]] deps = ["LinearAlgebra", "Random", "Statistics"] -git-tree-sha1 = "5c06c0aeb81bef54aed4b3f446847905eb6cbda0" +git-tree-sha1 = "5a3bcb6233adabde68ebc97be66e95dcb787424c" uuid = "90137ffa-7385-5640-81b9-e52037218182" -version = "0.12.3" +version = "0.12.1" [[Statistics]] deps = ["LinearAlgebra", "SparseArrays"] @@ -154,9 +167,9 @@ uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [[TimerOutputs]] deps = ["Printf"] -git-tree-sha1 = "0cc8db57cb537191b02948d4fabdc09eb7f31f98" +git-tree-sha1 = "311765af81bbb48d7bad01fb016d9c328c6ede03" uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.5" +version = "0.5.3" [[UUIDs]] deps = ["Random", "SHA"] diff --git a/Project.toml b/Project.toml index 37b74953..2f28a1e4 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "CUDAnative" uuid = "be33ccc6-a3ff-5ff2-a52e-74243cff1e17" -version = "3.1.0" +version = "3.0.4" [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" @@ -20,11 +20,10 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" [compat] Adapt = "0.4, 1.0" BinaryProvider = "0.3, 0.4, 0.5" -CEnum = "0.2, 0.3, 0.4" +CEnum = "0.2" CUDAapi = "3.1, 4.0" CUDAdrv = "6.2.1" ExprTools = "0.1" -GPUCompiler = "0.2" LLVM = "1.3.4" julia = "1.3" diff --git a/docs/Manifest.toml b/docs/Manifest.toml index 6aea55e4..a3ca4d56 100644 --- a/docs/Manifest.toml +++ b/docs/Manifest.toml @@ -19,9 +19,9 @@ version = "0.8.1" [[Documenter]] deps = ["Base64", "Dates", "DocStringExtensions", "InteractiveUtils", "JSON", "LibGit2", "Logging", "Markdown", "REPL", "Test", "Unicode"] -git-tree-sha1 = "395fa1554c69735802bba37d9e7d9586fd44326c" +git-tree-sha1 = "3bacd94d853a6bccaee1d0104d8b06d29a7506ac" uuid = "e30172f5-a6a5-5a46-863b-614d45cd2de4" -version = "0.24.11" +version = "0.24.6" [[InteractiveUtils]] deps = ["Markdown"] @@ -34,7 +34,6 @@ uuid = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" version = 
"0.21.0" [[LibGit2]] -deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[Libdl]] @@ -52,12 +51,12 @@ uuid = "a63ad114-7e13-5084-954f-fe012c677804" [[Parsers]] deps = ["Dates", "Test"] -git-tree-sha1 = "f0abb338b4d00306500056a3fd44c221b8473ef2" +git-tree-sha1 = "0c16b3179190d3046c073440d94172cfc3bb0553" uuid = "69de0a69-1ddd-5017-9359-2bf0b02dc9f0" -version = "1.0.4" +version = "0.3.12" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] diff --git a/res/Manifest.toml b/res/Manifest.toml index 9bdf2210..d78301af 100644 --- a/res/Manifest.toml +++ b/res/Manifest.toml @@ -10,38 +10,41 @@ version = "0.2.0" [[CSTParser]] deps = ["Tokenize"] -git-tree-sha1 = "a2f9009a81b92d078a682d4a8576adc1f8176e90" +git-tree-sha1 = "7d10b92c4d9951ccf3009d960d9b66883c174474" uuid = "00ebfdb7-1f24-5e51-bd34-a7502290713f" -version = "2.3.0" +version = "2.2.0" [[Clang]] deps = ["CEnum", "DataStructures", "LLVM_jll", "Libdl"] -git-tree-sha1 = "7a3b75a028f049ccf1bd835fc09a84dd813b5da6" +git-tree-sha1 = "98d24455089ea8567eaae53ebd51060aff1dac41" uuid = "40e3b903-d033-50b4-a0cc-940c62c95e31" -version = "0.11.0" +version = "0.10.1" [[DataStructures]] deps = ["InteractiveUtils", "OrderedCollections"] -git-tree-sha1 = "6166ecfaf2b8bbf2b68d791bc1d54501f345d314" +git-tree-sha1 = "5a431d46abf2ef2a4d5d00bd0ae61f651cf854c8" uuid = "864edb3b-99cc-5e75-8d2d-829cb0a9cfe8" -version = "0.17.15" +version = "0.17.10" [[Dates]] deps = ["Printf"] uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" +[[Distributed]] +deps = ["Random", "Serialization", "Sockets"] +uuid = "8ba89e20-285c-5b6f-9357-94700520ee1b" + [[InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" [[LLVM_jll]] deps = ["Libdl", "Pkg"] -git-tree-sha1 = "c037c15f36c185c613e5b2589d5833720dab3f76" +git-tree-sha1 = "8feeef069d771f4fc90935d8b1f6da3d82f9b96b" uuid = "86de99a1-58d6-5da7-8064-bd56ce2e322c" -version = "8.0.1+0" +version = "6.0.1+0" [[LibGit2]] -deps = ["Printf"] uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" [[Libdl]] @@ -55,12 +58,13 @@ deps = ["Base64"] uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" [[OrderedCollections]] -git-tree-sha1 = "12ce190210d278e12644bcadf5b21cbdcf225cd3" +deps = ["Random", "Serialization", "Test"] +git-tree-sha1 = "c4c13474d23c60d20a67b217f1d7f22a40edf8f1" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.2.0" +version = "1.1.0" [[Pkg]] -deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "UUIDs"] +deps = ["Dates", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Test", "UUIDs"] uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [[Printf]] @@ -84,10 +88,14 @@ uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" [[Sockets]] uuid = "6462fe0b-24de-5631-8697-dd941f90decc" +[[Test]] +deps = ["Distributed", "InteractiveUtils", "Logging", "Random"] +uuid = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + [[Tokenize]] -git-tree-sha1 = "73c00ad506d88a7e8e4f90f48a70943101728227" +git-tree-sha1 = "c3aab236f122445406cf7a6de8af0b794da5a950" uuid = "0796e94c-ce3b-5d07-9a54-7f471281c624" -version = "0.5.8" +version = "0.5.7" [[UUIDs]] deps = ["Random", "SHA"] diff --git a/src/CUDAnative.jl b/src/CUDAnative.jl index 03bfb17a..af223219 100644 --- a/src/CUDAnative.jl +++ b/src/CUDAnative.jl @@ -6,8 +6,6 @@ using CUDAdrv using 
LLVM using LLVM.Interop -using GPUCompiler - using Adapt @@ -37,7 +35,6 @@ const configure_lock = ReentrantLock() @noinline function _functional(show_reason::Bool=false) lock(configure_lock) do if configured[] === nothing - configured[] = false if __configure__(show_reason) configured[] = true try @@ -46,6 +43,8 @@ const configure_lock = ReentrantLock() configured[] = false rethrow() end + else + configured[] = false end end end @@ -62,23 +61,35 @@ end ## source code includes -include("init.jl") -include("compatibility.jl") -include("bindeps.jl") - -include("cupti/CUPTI.jl") -include("nvtx/NVTX.jl") - # needs to be loaded _before_ the compiler infrastructure, because of generated functions +include("device/tools.jl") include("device/pointer.jl") include("device/array.jl") include("device/cuda.jl") include("device/llvm.jl") -include("device/runtime.jl") include("device/tiling.jl") include("device/matmul_kernels.jl") -include("compiler.jl") +using GPUCompiler +include("device/runtime.jl") + +CUDACompilerTarget(args...; kwargs...) = PTXCompilerTarget(args...; + runtime_module=CUDAnative, + # filter out functions from libdevice and cudadevrt + isintrinsic_hook = fn->(fn=="__nvvm_reflect" || startswith(fn, "cuda")), + kwargs...) +CUDACompilerJob(args...; kwargs...) = PTXCompilerJob(args...; + rewrite_ir_hook = (job,mod)->emit_exception_flag!(mod), + link_library_hook = (job,mod,fns)->link_libdevice!(mod, job.target.cap, fns), + kwargs...) + +include("init.jl") +include("compatibility.jl") +include("bindeps.jl") + +include("cupti/CUPTI.jl") +include("nvtx/NVTX.jl") + include("execution.jl") include("exceptions.jl") include("reflection.jl") diff --git a/src/compiler.jl b/src/compiler.jl deleted file mode 100644 index 64ab2158..00000000 --- a/src/compiler.jl +++ /dev/null @@ -1,27 +0,0 @@ -struct CUDACompilerParams <: AbstractCompilerParams end - -CUDACompilerJob = CompilerJob{PTXCompilerTarget,CUDACompilerParams} - -GPUCompiler.runtime_module(::CUDACompilerJob) = CUDAnative - -# filter out functions from libdevice and cudadevrt -GPUCompiler.isintrinsic(job::CUDACompilerJob, fn::String) = - invoke(GPUCompiler.isintrinsic, - Tuple{CompilerJob{PTXCompilerTarget}, typeof(fn)}, - job, fn) || - fn == "__nvvm_reflect" || startswith(fn, "cuda") - -function GPUCompiler.finish_module!(job::CUDACompilerJob, mod::LLVM.Module) - invoke(GPUCompiler.finish_module!, - Tuple{CompilerJob{PTXCompilerTarget}, typeof(mod)}, - job, mod) - emit_exception_flag!(mod) -end - -function GPUCompiler.link_libraries!(job::CUDACompilerJob, mod::LLVM.Module, - undefined_fns::Vector{String}) - invoke(GPUCompiler.link_libraries!, - Tuple{CompilerJob{PTXCompilerTarget}, typeof(mod), typeof(undefined_fns)}, - job, mod, undefined_fns) - link_libdevice!(mod, job.target.cap, undefined_fns) -end diff --git a/src/device/array.jl b/src/device/array.jl index faea3e4f..dee0d41f 100644 --- a/src/device/array.jl +++ b/src/device/array.jl @@ -1,6 +1,7 @@ # Contiguous on-device arrays -export CuDeviceArray, CuDeviceVector, CuDeviceMatrix, ldg +export + CuDeviceArray, CuDeviceVector, CuDeviceMatrix, CuBoundsError, ldg ## construction diff --git a/src/device/cuda.jl b/src/device/cuda.jl index 758999a0..4e2772ad 100644 --- a/src/device/cuda.jl +++ b/src/device/cuda.jl @@ -11,9 +11,7 @@ include("cuda/assertion.jl") include("cuda/memory_dynamic.jl") include("cuda/atomics.jl") include("cuda/misc.jl") -if VERSION >= v"1.4.1" include("cuda/wmma.jl") -end # functionality from libdevice # diff --git a/src/device/cuda/memory_shared.jl 
b/src/device/cuda/memory_shared.jl index 99ed6e03..c19d5692 100644 --- a/src/device/cuda/memory_shared.jl +++ b/src/device/cuda/memory_shared.jl @@ -61,6 +61,7 @@ end end T_ptr = convert(LLVMType, DevicePtr{T,AS.Shared}) + T_actual_ptr = LLVM.PointerType(eltyp) # create a function llvm_f, _ = create_function(T_ptr) @@ -91,9 +92,10 @@ end entry = BasicBlock(llvm_f, "entry", JuliaContext()) position!(builder, entry) - ptr = gep!(builder, gv, [ConstantInt(0, JuliaContext()), - ConstantInt(0, JuliaContext())]) + ptr_with_as = gep!(builder, gv, [ConstantInt(0, JuliaContext()), + ConstantInt(0, JuliaContext())]) + ptr = addrspacecast!(builder, ptr_with_as, T_actual_ptr) val = ptrtoint!(builder, ptr, T_ptr) ret!(builder, val) end diff --git a/src/device/cuda/output.jl b/src/device/cuda/output.jl index 956bed43..859cdb69 100644 --- a/src/device/cuda/output.jl +++ b/src/device/cuda/output.jl @@ -211,23 +211,3 @@ macro cuprintln(parts...) CUDAnative.@cuprint($(parts...), "\n") end) end - -export @cushow - -""" - @cushow(ex) - -GPU analog of `Base.@show`. It comes with the same type restrictions as [@cuprint](@ref). -```julia -@cushow threadIdx().x -``` -""" -macro cushow(ex) - val = gensym("val") - s = string(ex) - quote - $val = $(esc(ex)) - CUDAnative.@cuprintln($(Expr(:string, s, " = ", val))) - $val - end -end diff --git a/src/device/cuda/wmma.jl b/src/device/cuda/wmma.jl index 088166c8..b8c75571 100644 --- a/src/device/cuda/wmma.jl +++ b/src/device/cuda/wmma.jl @@ -7,6 +7,9 @@ using CUDAnative: AS, DevicePtr # CONSTANTS ################################################################################ +# Determines whether or not to Core.AddrSpacePtr is available +const addrspaceptr_available = (VERSION >= v"1.5.0-DEV.324") + # Maps PTX types to Julia array types const map_ptx_to_jl_array = Dict( "f16" => Float16, @@ -49,14 +52,24 @@ get_frag_info(matrix, ptx_el_type) = ( get_addrspace_info(addr_space) = convert(Int, map_ptx_as_to_as_ty[addr_space]) +if addrspaceptr_available @generated function Base.cconvert(::Type{Core.AddrSpacePtr{T, as}}, x::DevicePtr{T, AS}) where {T, as, AS} - ir = "%ptr = inttoptr i64 %0 to i8 addrspace($as)* - ret i8 addrspace($as)* %ptr" + # Addrspacecast from i8* to i8* is invalid in LLVM + if as == 0 + return quote + return Base.bitcast(Core.AddrSpacePtr{T, as}, x) + end + else + ir = "%p = inttoptr i64 %0 to i8* + %ptr = addrspacecast i8* %p to i8 addrspace($as)* + ret i8 addrspace($as)* %ptr" - return quote - return Base.llvmcall($ir, Core.AddrSpacePtr{T, as}, Tuple{Int64}, Base.bitcast(Int64, x)) + return quote + return Base.llvmcall($ir, Core.AddrSpacePtr{T, as}, Tuple{Int64}, Base.bitcast(Int64, x)) + end end end +end # Fix for https://github.com/JuliaGPU/CUDAnative.jl/issues/587. # Instead of ccall'ing the intrinsics with NTuple{N, T} (which gets lowered to @@ -128,7 +141,7 @@ for mat in ["a", "b", "c"], ccall_name = "extern $llvm_intr" - ptr_ty = Core.AddrSpacePtr{arr_ty, addr_space_int} + ptr_ty = addrspaceptr_available ? Core.AddrSpacePtr{arr_ty, addr_space_int} : Ref{arr_ty} struct_ty = Symbol("LLVMStruct$sz") @eval $func_name(src_addr, stride) = convert(NTuple{$sz, $frag_ty}, ccall($ccall_name, llvmcall, $struct_ty{$frag_ty}, ($ptr_ty, Int32), src_addr, stride)) @@ -183,7 +196,7 @@ for mat in ["d"], frag_types = ntuple(i -> frag_ty, sz) frag_vars = ntuple(i -> :(data[$i]), sz) - ptr_ty = Core.AddrSpacePtr{arr_ty, addr_space_int} + ptr_ty = addrspaceptr_available ? 
Core.AddrSpacePtr{arr_ty, addr_space_int} : Ref{arr_ty} @eval $func_name(dst_addr, data, stride) = ccall($ccall_name, llvmcall, Nothing, ($ptr_ty, $(frag_types...), Int32), dst_addr, $(frag_vars...), stride) @eval export $func_name diff --git a/src/device/pointer.jl b/src/device/pointer.jl index d335dab7..e6596d1e 100644 --- a/src/device/pointer.jl +++ b/src/device/pointer.jl @@ -118,7 +118,7 @@ Base.:(+)(x::Integer, y::DevicePtr) = y + x T_int = convert(LLVMType, Int) T_ptr = convert(LLVMType, DevicePtr{T,A}) - T_actual_ptr = LLVM.PointerType(eltyp, convert(Int, A)) + T_actual_ptr = LLVM.PointerType(eltyp) # create a function param_types = [T_ptr, T_int] @@ -130,8 +130,10 @@ Base.:(+)(x::Integer, y::DevicePtr) = y + x position!(builder, entry) ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr) + ptr = gep!(builder, ptr, [parameters(llvm_f)[2]]) - ld = load!(builder, ptr) + ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A))) + ld = load!(builder, ptr_with_as) if A != AS.Generic metadata(ld)[LLVM.MD_tbaa] = tbaa_addrspace(A) @@ -151,7 +153,7 @@ end T_int = convert(LLVMType, Int) T_ptr = convert(LLVMType, DevicePtr{T,A}) - T_actual_ptr = LLVM.PointerType(eltyp, convert(Int, A)) + T_actual_ptr = LLVM.PointerType(eltyp) # create a function param_types = [T_ptr, eltyp, T_int] @@ -163,9 +165,11 @@ end position!(builder, entry) ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr) + ptr = gep!(builder, ptr, [parameters(llvm_f)[3]]) + ptr_with_as = addrspacecast!(builder, ptr, LLVM.PointerType(eltyp, convert(Int, A))) val = parameters(llvm_f)[2] - st = store!(builder, val, ptr) + st = store!(builder, val, ptr_with_as) if A != AS.Generic metadata(st)[LLVM.MD_tbaa] = tbaa_addrspace(A) @@ -197,7 +201,8 @@ const LDGTypes = Union{UInt8, UInt16, UInt32, UInt64, T_int32 = LLVM.Int32Type(JuliaContext()) T_ptr = convert(LLVMType, DevicePtr{T,AS.Global}) - T_actual_ptr = LLVM.PointerType(eltyp, convert(Int, AS.Global)) + T_actual_ptr = LLVM.PointerType(eltyp) + T_actual_ptr_as = LLVM.PointerType(eltyp, convert(Int, AS.Global)) # create a function param_types = [T_ptr, T_int] @@ -217,7 +222,7 @@ const LDGTypes = Union{UInt8, UInt16, UInt32, UInt64, "llvm.nvvm.ldg.global.$class.$typ.p1$typ" end mod = LLVM.parent(llvm_f) - intrinsic_typ = LLVM.FunctionType(eltyp, [T_actual_ptr, T_int32]) + intrinsic_typ = LLVM.FunctionType(eltyp, [T_actual_ptr_as, T_int32]) intrinsic = LLVM.Function(mod, intrinsic_name, intrinsic_typ) # generate IR @@ -226,9 +231,11 @@ const LDGTypes = Union{UInt8, UInt16, UInt32, UInt64, position!(builder, entry) ptr = inttoptr!(builder, parameters(llvm_f)[1], T_actual_ptr) + ptr = gep!(builder, ptr, [parameters(llvm_f)[2]]) + ptr_with_as = addrspacecast!(builder, ptr, T_actual_ptr_as) ld = call!(builder, intrinsic, - [ptr, ConstantInt(Int32(align), JuliaContext())]) + [ptr_with_as, ConstantInt(Int32(align), JuliaContext())]) metadata(ld)[LLVM.MD_tbaa] = tbaa_addrspace(AS.Global) diff --git a/src/device/runtime.jl b/src/device/runtime.jl index 7107e11e..c1cee617 100644 --- a/src/device/runtime.jl +++ b/src/device/runtime.jl @@ -8,10 +8,9 @@ GPUCompiler.reset_runtime() # load or build the runtime for the most likely compilation job given a compute capability function load_runtime(cap::VersionNumber) - target = PTXCompilerTarget(; cap=cap) - dummy_source = FunctionSpec(()->return, Tuple{}) - params = CUDACompilerParams() - job = CompilerJob(target, dummy_source, params) + target = CUDACompilerTarget(cap) + dummy_spec = FunctionSpec(()->return, 
Tuple{}) + job = CUDACompilerJob(target, dummy_spec) GPUCompiler.load_runtime(job) end @@ -39,10 +38,7 @@ function report_exception(ex) return end -function report_oom(sz) - @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) - return -end +report_oom(sz) = @cuprintf("ERROR: Out of dynamic GPU memory (trying to allocate %i bytes)\n", sz) function report_exception_name(ex) @cuprintf(""" diff --git a/src/device/tools.jl b/src/device/tools.jl new file mode 100644 index 00000000..d9767eca --- /dev/null +++ b/src/device/tools.jl @@ -0,0 +1,16 @@ +# Tools for implementing device functionality + +function tbaa_make_child(name::String, constant::Bool=false; ctx::LLVM.Context=JuliaContext()) + tbaa_root = MDNode([MDString("ptxtbaa", ctx)], ctx) + tbaa_struct_type = + MDNode([MDString("ptxtbaa_$name", ctx), + tbaa_root, + LLVM.ConstantInt(0, ctx)], ctx) + tbaa_access_tag = + MDNode([tbaa_struct_type, + tbaa_struct_type, + LLVM.ConstantInt(0, ctx), + LLVM.ConstantInt(constant ? 1 : 0, ctx)], ctx) + + return tbaa_access_tag +end diff --git a/src/execution.jl b/src/execution.jl index ca82bfbc..f02ca6b5 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -3,6 +3,65 @@ export @cuda, cudaconvert, cufunction, dynamic_cufunction, nextwarp, prevwarp +## helper functions + +# split keyword arguments to `@cuda` into ones affecting the macro itself, the compiler and +# the code it generates, or the execution +function split_kwargs(kwargs) + macro_kws = [:dynamic] + compiler_kws = [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name] + call_kws = [:cooperative, :blocks, :threads, :config, :shmem, :stream] + macro_kwargs = [] + compiler_kwargs = [] + call_kwargs = [] + for kwarg in kwargs + if Meta.isexpr(kwarg, :(=)) + key,val = kwarg.args + if isa(key, Symbol) + if key in macro_kws + push!(macro_kwargs, kwarg) + elseif key in compiler_kws + push!(compiler_kwargs, kwarg) + elseif key in call_kws + push!(call_kwargs, kwarg) + else + throw(ArgumentError("unknown keyword argument '$key'")) + end + else + throw(ArgumentError("non-symbolic keyword '$key'")) + end + else + throw(ArgumentError("non-keyword argument like option '$kwarg'")) + end + end + + return macro_kwargs, compiler_kwargs, call_kwargs +end + +# assign arguments to variables, handle splatting +function assign_args!(code, args) + # handle splatting + splats = map(arg -> Meta.isexpr(arg, :(...)), args) + args = map(args, splats) do arg, splat + splat ? arg.args[1] : arg + end + + # assign arguments to variables + vars = Tuple(gensym() for arg in args) + map(vars, args) do var,arg + push!(code.args, :($var = $arg)) + end + + # convert the arguments, compile the function and call the kernel + # while keeping the original arguments alive + var_exprs = map(vars, args, splats) do var, arg, splat + splat ? Expr(:(...), var) : var + end + + return vars, var_exprs +end + + ## high-level @cuda interface """ @@ -53,19 +112,9 @@ macro cuda(ex...) 
args = call.args[2:end] code = quote end + macro_kwargs, compiler_kwargs, call_kwargs = split_kwargs(kwargs) vars, var_exprs = assign_args!(code, args) - # group keyword argument - macro_kwargs, compiler_kwargs, call_kwargs, other_kwargs = - split_kwargs(kwargs, - [:dynamic], - [:minthreads, :maxthreads, :blocks_per_sm, :maxregs, :name], - [:cooperative, :blocks, :threads, :config, :shmem, :stream]) - if !isempty(other_kwargs) - key,val = first(other_kwargs).args - throw(ArgumentError("Unsupported keyword argument '$key'")) - end - # handle keyword arguments that influence the macro's behavior dynamic = false for kwarg in macro_kwargs @@ -170,13 +219,8 @@ AbstractKernel sig = Base.signature_type(F, TT) args = (:F, (:( args[$i] ) for i in 1:length(args))...) - # filter out arguments that shouldn't be passed - predicate = if VERSION >= v"1.5.0-DEV.581" - dt -> isghosttype(dt) || Core.Compiler.isconstType(dt) - else - dt -> isghosttype(dt) - end - to_pass = map(!predicate, sig.parameters) + # filter out ghost arguments that shouldn't be passed + to_pass = map(!isghosttype, sig.parameters) call_t = Type[x[1] for x in zip(sig.parameters, to_pass) if x[2]] call_args = Union{Expr,Symbol}[x[1] for x in zip(args, to_pass) if x[2]] @@ -292,19 +336,18 @@ function cufunction(f::Core.Function, tt::Type=Tuple{}; name=nothing, kwargs...) env = hash(pointer_from_objref(ctx)) # contexts are unique, but handles might alias # TODO: implement this as a hash function in CUDAdrv - source = FunctionSpec(f, tt, true, name) - GPUCompiler.cached_compilation(_cufunction, source, env; kwargs...)::HostKernel{f,tt} + spec = FunctionSpec(f, tt, true, name) + GPUCompiler.cached_compilation(_cufunction, spec, env; kwargs...)::HostKernel{f,tt} end # actual compilation -function _cufunction(source::FunctionSpec; kwargs...) +function _cufunction(spec::FunctionSpec; kwargs...) # compile to PTX ctx = context() dev = device(ctx) cap = supported_capability(dev) - target = PTXCompilerTarget(; cap=supported_capability(dev), kwargs...) - params = CUDACompilerParams() - job = CompilerJob(target, source, params) + target = CUDACompilerTarget(supported_capability(dev)) + job = CUDACompilerJob(target, spec; kwargs...) asm, kernel_fn, undefined_fns = GPUCompiler.compile(:asm, job; strict=true) # settings to JIT based on Julia's debug setting @@ -334,7 +377,7 @@ function _cufunction(source::FunctionSpec; kwargs...) # JIT into an executable kernel object mod = CuModule(image, jit_options) fun = CuFunction(mod, kernel_fn) - kernel = HostKernel{source.f,source.tt}(ctx, mod, fun) + kernel = HostKernel{spec.f,spec.tt}(ctx, mod, fun) create_exceptions!(mod) diff --git a/src/reflection.jl b/src/reflection.jl index dde6fd3b..55acafca 100644 --- a/src/reflection.jl +++ b/src/reflection.jl @@ -44,13 +44,12 @@ See also: [`@device_code_sass`](@ref) function code_sass(io::IO, @nospecialize(func), @nospecialize(types), kernel::Bool=true; verbose::Bool=false, kwargs...) tt = Base.to_tuple_type(types) - target = PTXCompilerTarget(; cap=supported_capability(device()), kwargs...) - params = CUDACompilerParams() - job = CompilerJob(target, FunctionSpec(func, tt, kernel), params) + target = CUDACompilerTarget(supported_capability(device())) + job = CUDACompilerJob(target, FunctionSpec(func, tt, kernel); kwargs...) 
code_sass(io, job; verbose=verbose) end -function code_sass(io::IO, job::CUDACompilerJob; verbose::Bool=false) +function code_sass(io::IO, job::PTXCompilerJob; verbose::Bool=false) if !job.source.kernel error("Can only generate SASS code for kernel functions") end @@ -113,11 +112,10 @@ for method in (:code_typed, :code_warntype, :code_llvm, :code_native) kernel::Bool=false, minthreads=nothing, maxthreads=nothing, blocks_per_sm=nothing, maxregs=nothing, kwargs...) source = FunctionSpec(func, Base.to_tuple_type(types), kernel) - target = PTXCompilerTarget(; cap=supported_capability(device()), - minthreads=minthreads, maxthreads=maxthreads, - blocks_per_sm=blocks_per_sm, maxregs=maxregs) - params = CUDACompilerParams() - job = CompilerJob(target, source, params) + target = CUDACompilerTarget(supported_capability(device())) + job = CUDACompilerJob(target, source; + minthreads=minthreads, maxthreads=maxthreads, + blocks_per_sm=blocks_per_sm, maxregs=maxregs) GPUCompiler.$method($(args...); kwargs...) end $method(@nospecialize(func), @nospecialize(types); kwargs...) = @@ -145,7 +143,7 @@ Evaluates the expression `ex` and prints the result of [`CUDAnative.code_sass`]( [`CUDAnative.code_sass`](@ref). """ macro device_code_sass(ex...) - function hook(job::CUDACompilerJob; io::IO=stdout, kwargs...) + function hook(job::PTXCompilerJob; io::IO=stdout, kwargs...) println(io, "// $job") println(io) code_sass(io, job; kwargs...) diff --git a/test/codegen.jl b/test/codegen.jl index cd3f00d0..ab2874b5 100644 --- a/test/codegen.jl +++ b/test/codegen.jl @@ -24,13 +24,13 @@ end ir = sprint(io->CUDAnative.code_llvm(io, f, Tuple{CUDAnative.DevicePtr{Float32,AS.Global}}; dump_module=true, raw=true)) - @test occursin("gputbaa_global", ir) + @test occursin("ptxtbaa_global", ir) # no TBAA on generic pointers ir = sprint(io->CUDAnative.code_llvm(io, f, Tuple{CUDAnative.DevicePtr{Float32,AS.Generic}}; dump_module=true, raw=true)) - @test !occursin("gputbaa", ir) + @test !occursin("ptxtbaa", ir) end @@ -39,7 +39,7 @@ end ir = sprint(io->CUDAnative.code_llvm(io, cached_load, Tuple{CUDAnative.DevicePtr{Float32,AS.Global}}; dump_module=true, raw=true)) - @test occursin("gputbaa_global", ir) + @test occursin("ptxtbaa_global", ir) end @testset "ghost values" begin diff --git a/test/device/cuda.jl b/test/device/cuda.jl index 6d43e6b1..ebded25b 100644 --- a/test/device/cuda.jl +++ b/test/device/cuda.jl @@ -393,19 +393,6 @@ end @test out == "42$endline" end -@testset "@cushow" begin - function use_cushow() - seven_i32 = Int32(7) - three_f64 = Float64(3) - @cushow seven_i32 - @cushow three_f64 - @cushow 1f0 + 4f0 - return nothing - end - - _, out = @grab_output @on_device use_cushow() - @test out == "seven_i32 = 7\nthree_f64 = 3.000000\n1.0f0 + 4.0f0 = 5.000000\n" -end ############################################################################################ @@ -1133,17 +1120,6 @@ end end end -@testset "shared memory" begin - function kernel() - shared = @cuStaticSharedMem(Float32, 1) - @atomic shared[threadIdx().x] += 0f0 - return - end - - @cuda kernel() - synchronize() -end - end end diff --git a/test/device/execution.jl b/test/device/execution.jl index 1b3af97a..5ac5d7b3 100644 --- a/test/device/execution.jl +++ b/test/device/execution.jl @@ -558,15 +558,16 @@ end @testset "stack traces at different debug levels" begin script = """ - function kernel(arr, val) - arr[1] = val + function kernel(ptr, val) + unsafe_store!(ptr, Int(val)) return end cpu = zeros(Int) - gpu = CuArray(cpu) - @cuda kernel(gpu, 1.2) - Array(gpu) + 
gpu = CUDAdrv.Mem.alloc(CUDAdrv.Mem.Device, sizeof(cpu)) + gpu_ptr = convert(CUDAdrv.CuPtr{Int}, gpu) + @cuda kernel(gpu_ptr, 1.2) + unsafe_copyto!(pointer(cpu), gpu_ptr, 1) """ let (code, out, err) = julia_script(script, `-g0`) @@ -591,7 +592,7 @@ let (code, out, err) = julia_script(script, `-g2`) else @test occursin("[1] Int64 at float.jl", out) end - @test occursin("[4] kernel at none:6", out) + @test occursin("[2] kernel at none:2", out) end end @@ -613,8 +614,8 @@ let (code, out, err) = julia_script(script, `-g2`) @test code == 1 @test occursin("ERROR: KernelException: exception thrown during kernel execution on device", err) @test occursin("ERROR: a exception was thrown during kernel execution", out) - @test occursin("foo at none:5", out) - @test occursin("bar at none:6", out) + @test occursin("foo at none:1", out) + @test occursin("bar at none:2", out) end end diff --git a/test/device/wmma.jl b/test/device/wmma.jl index 93b5f44a..44f0a261 100644 --- a/test/device/wmma.jl +++ b/test/device/wmma.jl @@ -1,5 +1,6 @@ # Need https://github.com/JuliaLang/julia/pull/33970 # and https://github.com/JuliaLang/julia/pull/34043 +if VERSION >= v"1.4.0-DEV.666" && capability(device()) >= v"7.0" using CUDAnative.WMMA @@ -230,7 +231,7 @@ is_debug = ccall(:jl_is_debugbuild, Cint, ()) != 0 return end - @test_broken_if v"1.5.0-DEV.393" <= VERSION < v"1.5.0-DEV.851" begin + @test_broken_if VERSION >= v"1.5.0-DEV.393" begin @cuda threads=32 kernel(a_dev, b_dev, c_dev, d_dev, alpha, beta) d = Array(d_dev) @@ -293,3 +294,4 @@ end ################################################################################ end +end diff --git a/test/examples.jl b/test/examples.jl index 414581de..6b7041aa 100644 --- a/test/examples.jl +++ b/test/examples.jl @@ -18,12 +18,20 @@ filter!(file -> readline(file) != "# EXCLUDE FROM TESTING", examples) cd(examples_dir) do examples = relpath.(examples, Ref(examples_dir)) @testset for example in examples - cmd = Base.julia_cmd() + # construct a command + cmd = `$(Base.julia_cmd()) --startup=no` if Base.JLOptions().project != C_NULL + # --project isn't preserved by julia_cmd() cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` end + cmd = `$cmd $example` - @test success(pipeline(`$cmd $example`, stderr=stderr)) + # run it and conditionally show output + out = Pipe() + rv = success(pipeline(cmd, stdout=out, stderr=out)) + close(out.in) + rv || print(read(out, String)) + @test rv end end diff --git a/test/init.jl b/test/init.jl index 94e7a3d1..b9bd04c1 100644 --- a/test/init.jl +++ b/test/init.jl @@ -110,7 +110,6 @@ candidates = [(device!(dev); # and is used to pick a codegen target regardless of the actual device. 
cuda_support = CUDAnative.cuda_compat() filter!(x->x.cap in cuda_support.cap, candidates) -isempty(candidates) && error("Could not find any suitable device for this configuration") ## order by available memory, but also by capability if testing needs to be thorough thorough = parse(Bool, get(ENV, "CI_THOROUGH", "false")) if thorough @@ -118,6 +117,7 @@ if thorough else sort!(candidates, by=x->x.mem) end +isempty(candidates) && error("Could not find any suitable device for this configuration") pick = last(candidates) @info("Testing using device $(name(pick.dev)) (compute capability $(pick.cap), $(Base.format_bytes(pick.mem)) available memory) on CUDA driver $(CUDAdrv.version()) and toolkit $(CUDAnative.version())") device!(pick.dev) diff --git a/test/runtests.jl b/test/runtests.jl index b875f2e9..58a8b5e9 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -23,24 +23,22 @@ if haskey(ENV, "CI") && haskey(ENV, "JULIA_CUDA_VERSION") end length(devices()) > 0 || error("The CUDAnative.jl test suite requires a CUDA device") -include("init.jl") -include("pointer.jl") -include("codegen.jl") +#= include("init.jl") =# +#= include("pointer.jl") =# +#= include("codegen.jl") =# capability(device()) >= v"2.0" || error("The CUDAnative.jl test suite requires a CUDA device with compute capability 2.0 or higher") -include("device/codegen.jl") -include("device/execution.jl") -include("device/pointer.jl") -include("device/array.jl") -include("device/cuda.jl") -if VERSION >= v"1.4.1" && capability(device()) >= v"7.0" -include("device/wmma.jl") +#= include("device/codegen.jl") =# +#= include("device/execution.jl") =# +#= include("device/pointer.jl") =# +#= include("device/array.jl") =# +#= include("device/cuda.jl") =# +#= include("device/wmma.jl") =# include("device/tiling.jl") include("device/matmul_kernels.jl") -end -include("nvtx.jl") +#= include("nvtx.jl") =# -include("examples.jl") +#= include("examples.jl") =# end diff --git a/test/util.jl b/test/util.jl index 0fece1db..05ed6d41 100644 --- a/test/util.jl +++ b/test/util.jl @@ -76,16 +76,22 @@ end function julia_script(code, args=``) # FIXME: this doesn't work when the compute mode is set to exclusive - script = """using CUDAnative, CUDAdrv - const CuArray = CUDAnative.CuHostArray - device!($(device())) - - $code""" - cmd = Base.julia_cmd() + script = "using CUDAnative, CUDAdrv; device!($(device())); $code" + cmd = ``` + $(Base.julia_cmd()) + --code-coverage=$(("none", "user", "all")[Base.JLOptions().code_coverage + 1]) + --color=$(Base.have_color ? "yes" : "no") + --compiled-modules=$(Bool(Base.JLOptions().use_compiled_modules) ? "yes" : "no") + --check-bounds=yes + --startup-file=$(Base.JLOptions().startupfile == 1 ? 
"yes" : "no") + --track-allocation=$(("none", "user", "all")[Base.JLOptions().malloc_log + 1]) + --eval $script + ``` if Base.JLOptions().project != C_NULL + # --project isn't preserved by julia_cmd() cmd = `$cmd --project=$(unsafe_string(Base.JLOptions().project))` end - cmd = `$cmd --eval $script $args` + cmd = `$cmd $args` out = Pipe() err = Pipe() From 1ff1894cfed81403b585780a3011b45c6b47a133 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 14:10:37 -0400 Subject: [PATCH 30/34] Fixes --- src/device/matmul_kernels/operator.jl | 32 +++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl index 65724637..b082996f 100644 --- a/src/device/matmul_kernels/operator.jl +++ b/src/device/matmul_kernels/operator.jl @@ -72,34 +72,34 @@ struct WMMAComplexOp{M, N, K} end @inline function load_a(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{16, 16, 16, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) return (WMMA.load_a(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), - WMMA.load_a(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) + WMMA.load_a(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), size(workspace)[1], WMMA.ColMajor, conf)) end @inline function load_b(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{16, 16, 16, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) return (WMMA.load_b(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), - WMMA.load_b(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) + WMMA.load_b(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), size(workspace)[1], WMMA.ColMajor, conf)) end @inline function load_c(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) return (WMMA.load_c(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), - WMMA.load_c(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) + WMMA.load_c(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), size(workspace)[1], WMMA.ColMajor, conf)) end @inline function store_d(::Type{WMMAComplexOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, frag, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) WMMA.store_d(pointer(workspace, ind), frag[1], size(workspace)[1], WMMA.ColMajor, conf) - WMMA.store_d(pointer(workspace, ind + prod(size(workspace)[1:2])), frag[2], size(workspace)[1], WMMA.ColMajor, conf) + WMMA.store_d(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), frag[2], size(workspace)[1], WMMA.ColMajor, conf) end using LLVM @@ -146,34 +146,34 @@ struct WMMADualOp{M, N, K} end @inline function load_a(::Type{WMMADualOp{M, N, K}}, 
::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{16, 16, 16, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) return (WMMA.load_a(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), - WMMA.load_a(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) + WMMA.load_a(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), size(workspace)[1], WMMA.ColMajor, conf)) end @inline function load_b(::Type{WMMADualOp{M, N, K}}, ::Type{Layout.SplitComplex{Float16}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{16, 16, 16, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) return (WMMA.load_b(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), - WMMA.load_b(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) + WMMA.load_b(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), size(workspace)[1], WMMA.ColMajor, conf)) end @inline function load_c(::Type{WMMADualOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) return (WMMA.load_c(pointer(workspace, ind), size(workspace)[1], WMMA.ColMajor, conf), - WMMA.load_c(pointer(workspace, ind + prod(size(workspace)[1:2])), size(workspace)[1], WMMA.ColMajor, conf)) + WMMA.load_c(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), size(workspace)[1], WMMA.ColMajor, conf)) end @inline function store_d(::Type{WMMADualOp{M, N, K}}, ::Type{Layout.SplitComplex{Float32}}, workspace, frag, tile::Tile) where {M, N, K} conf = WMMA.Config{M, N, K, Float32} - ind = linearise(tile.index, size(workspace)[1:2]) + ind = linearise(tile.index, (size(workspace)[1], size(workspace)[2])) WMMA.store_d(pointer(workspace, ind), frag[1], size(workspace)[1], WMMA.ColMajor, conf) - WMMA.store_d(pointer(workspace, ind + prod(size(workspace)[1:2])), frag[2], size(workspace)[1], WMMA.ColMajor, conf) + WMMA.store_d(pointer(workspace, ind + size(workspace)[1] * size(workspace)[2]), frag[2], size(workspace)[1], WMMA.ColMajor, conf) end @inline function mma(::Type{WMMADualOp{M, N, K}}, a_frag, b_frag, c_frag) where {M, N, K} From 5b066e0e2d7866b4640f822397e0eb59bca753dd Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 16:21:17 -0400 Subject: [PATCH 31/34] Add more plots --- test/perf/matmul_kernels/complex-wmma/plot.jl | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/perf/matmul_kernels/complex-wmma/plot.jl b/test/perf/matmul_kernels/complex-wmma/plot.jl index c274b798..68d72e72 100644 --- a/test/perf/matmul_kernels/complex-wmma/plot.jl +++ b/test/perf/matmul_kernels/complex-wmma/plot.jl @@ -16,6 +16,8 @@ function plot_results(file, label) end plot_results("cudanative.csv", "CUDAnative") +plot_results("cudanative-generic-fp32.csv", "CUDAnative generic (FP32)") +plot_results("cudanative-generic-fp16.csv", "CUDAnative generic (FP16)") plot_results("cutlass.csv", "CUTLASS Example") title!("Performance of mixed-precision complex GEMM\nProblem size: N x N x N") From 48d6a09467e20f6db5ff96c0d516010b82b20fe4 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 16:22:10 -0400 Subject: 
[PATCH 32/34] Add generic scripts --- .../complex-wmma/cudanative-generic-fp16.sh | 22 ++++++++++++++ .../complex-wmma/cudanative-generic-fp32.sh | 22 ++++++++++++++ .../complex-wmma/cudanative-generic.jl | 30 +++++++++++++++++++ 3 files changed, 74 insertions(+) create mode 100755 test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp16.sh create mode 100755 test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp32.sh create mode 100644 test/perf/matmul_kernels/complex-wmma/cudanative-generic.jl diff --git a/test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp16.sh b/test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp16.sh new file mode 100755 index 00000000..46259b38 --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp16.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative-generic-fp16.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative-generic.jl $N $N $N FP16 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative-generic-fp16.csv +done diff --git a/test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp32.sh b/test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp32.sh new file mode 100755 index 00000000..06343260 --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/cudanative-generic-fp32.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative-generic-fp32.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative-generic.jl $N $N $N FP32 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative-generic-fp32.csv +done diff --git a/test/perf/matmul_kernels/complex-wmma/cudanative-generic.jl b/test/perf/matmul_kernels/complex-wmma/cudanative-generic.jl new file mode 100644 index 00000000..770a6a2e --- /dev/null +++ b/test/perf/matmul_kernels/complex-wmma/cudanative-generic.jl @@ -0,0 +1,30 @@ +using CUDAnative, CuArrays, GPUArrays, CUDAdrv; + +M = parse(Int, ARGS[1]); +N = parse(Int, ARGS[2]); +K = parse(Int, ARGS[3]); + +if ARGS[4] == "FP32" + T = Float32; +elseif ARGS[4] == "FP16" + T = Float16; +else + error("Invalid type: $(ARGS[4])"); +end + +a_h = rand(Complex{T}, (M, K)) / sqrt(T(K)); +b_h = rand(Complex{T}, (K, N)) / sqrt(T(K)); +c_h = rand(Complex{T}, (M, N)); + +a = CuArray(a_h); +b = CuArray(b_h); +c = CuArray(c_h); +d = similar(c); + +# warmup +GPUArrays.generic_matmatmul!(c, b, a, T(1), T(1)) + +# profile +for i = 1 : 10 + CUDAdrv.@profile GPUArrays.generic_matmatmul!(c, b, a, T(1), T(1)) +end From 43c645fa217ae92d4a8ab74c0aed638c6d4309b7 Mon Sep 17 00:00:00 2001 From: Thomas Faingnaert Date: Sat, 30 May 2020 17:55:37 -0400 Subject: [PATCH 33/34] Add dual plots --- test/perf/matmul_kernels/dual-wmma/.gitignore | 2 + .../dual-wmma/cudanative-generic-fp16.sh | 22 +++++++ 
.../dual-wmma/cudanative-generic-fp32.sh | 22 +++++++ .../dual-wmma/cudanative-generic.jl | 30 ++++++++++ .../matmul_kernels/dual-wmma/cudanative.jl | 60 +++++++++++++++++++ .../matmul_kernels/dual-wmma/cudanative.sh | 22 +++++++ test/perf/matmul_kernels/dual-wmma/plot.jl | 25 ++++++++ 7 files changed, 183 insertions(+) create mode 100644 test/perf/matmul_kernels/dual-wmma/.gitignore create mode 100755 test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp16.sh create mode 100755 test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp32.sh create mode 100644 test/perf/matmul_kernels/dual-wmma/cudanative-generic.jl create mode 100644 test/perf/matmul_kernels/dual-wmma/cudanative.jl create mode 100755 test/perf/matmul_kernels/dual-wmma/cudanative.sh create mode 100644 test/perf/matmul_kernels/dual-wmma/plot.jl diff --git a/test/perf/matmul_kernels/dual-wmma/.gitignore b/test/perf/matmul_kernels/dual-wmma/.gitignore new file mode 100644 index 00000000..5919a65f --- /dev/null +++ b/test/perf/matmul_kernels/dual-wmma/.gitignore @@ -0,0 +1,2 @@ +*.csv +*.pdf diff --git a/test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp16.sh b/test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp16.sh new file mode 100755 index 00000000..46259b38 --- /dev/null +++ b/test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp16.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative-generic-fp16.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative-generic.jl $N $N $N FP16 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative-generic-fp16.csv +done diff --git a/test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp32.sh b/test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp32.sh new file mode 100755 index 00000000..06343260 --- /dev/null +++ b/test/perf/matmul_kernels/dual-wmma/cudanative-generic-fp32.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative-generic-fp32.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative-generic.jl $N $N $N FP32 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative-generic-fp32.csv +done diff --git a/test/perf/matmul_kernels/dual-wmma/cudanative-generic.jl b/test/perf/matmul_kernels/dual-wmma/cudanative-generic.jl new file mode 100644 index 00000000..39b78378 --- /dev/null +++ b/test/perf/matmul_kernels/dual-wmma/cudanative-generic.jl @@ -0,0 +1,30 @@ +using CUDAnative, CuArrays, GPUArrays, CUDAdrv, ForwardDiff; + +M = parse(Int, ARGS[1]); +N = parse(Int, ARGS[2]); +K = parse(Int, ARGS[3]); + +if ARGS[4] == "FP32" + T = Float32; +elseif ARGS[4] == "FP16" + T = Float16; +else + error("Invalid type: $(ARGS[4])"); +end + +a_h = rand(Complex{T}, (M, K)) / sqrt(T(K)); +b_h = rand(Complex{T}, (K, N)) / sqrt(T(K)); +c_h = rand(Complex{T}, (M, N)); + +a = 
CuArray(reinterpret(ForwardDiff.Dual{T,T,1}, a_h)); +b = CuArray(reinterpret(ForwardDiff.Dual{T,T,1}, b_h)); +c = CuArray(reinterpret(ForwardDiff.Dual{T,T,1}, c_h)); +d = similar(c); + +# warmup +GPUArrays.generic_matmatmul!(c, b, a, T(1), T(1)) + +# profile +for i = 1 : 10 + CUDAdrv.@profile GPUArrays.generic_matmatmul!(c, b, a, T(1), T(1)) +end diff --git a/test/perf/matmul_kernels/dual-wmma/cudanative.jl b/test/perf/matmul_kernels/dual-wmma/cudanative.jl new file mode 100644 index 00000000..c8360640 --- /dev/null +++ b/test/perf/matmul_kernels/dual-wmma/cudanative.jl @@ -0,0 +1,60 @@ +using CUDAdrv +using CUDAnative +using CUDAnative.MatMul +using CuArrays + +M = parse(Int, ARGS[1]) +N = parse(Int, ARGS[2]) +K = parse(Int, ARGS[3]) + +function benchmark_matmul(a, b, c, d) + CuArrays.@sync begin + conf = MatMul.get_config( + gemm_shape = (M = M, N = N, K = K), + operator = Operator.WMMADualOp{16, 16, 16}, + + global_a_layout = Layout.InterleavedComplex{Float16}, + global_b_layout = Layout.InterleavedComplex{Float16}, + global_c_layout = Layout.InterleavedComplex{Float32}, + global_d_layout = Layout.InterleavedComplex{Float32}, + + shared_a_layout = Layout.Padded{Layout.SplitComplex{Float16}, 8}, + shared_b_layout = Layout.Padded{Layout.SplitComplex{Float16}, 8}, + shared_c_layout = Layout.SplitComplex{Float32}, + shared_d_layout = Layout.SplitComplex{Float32}, + + warps_per_block = 8, + + compute_warp = (M = 16, N = 32, K = 16), + + block_shape = (M = 64, N = 64, K = 32), + + mem_a_warp = (M = 64, K = 2), + mem_b_warp = (K = 32, N = 4), + mem_cd_warp = (M = 64, N = 1), + + mem_a_thread = (M = 4, K = 1), + mem_b_thread = (K = 4, N = 1), + mem_cd_thread = (M = 2, N = 1) + ) + + MatMul.matmul(a, b, c, d, conf) + end +end + +a_h = rand(Complex{Float16}, (M, K)) / sqrt(Float16(K)); +b_h = rand(Complex{Float16}, (K, N)) / sqrt(Float16(K)); +c_h = rand(Complex{Float32}, (M, N)); + +a = CuArray(a_h); +b = CuArray(b_h); +c = CuArray(c_h); +d = similar(c); + +# warmup +benchmark_matmul(a, b, c, d) + +# profile +for i = 1 : 10 + CUDAdrv.@profile benchmark_matmul(a, b, c, d) +end diff --git a/test/perf/matmul_kernels/dual-wmma/cudanative.sh b/test/perf/matmul_kernels/dual-wmma/cudanative.sh new file mode 100755 index 00000000..f9b5d7ab --- /dev/null +++ b/test/perf/matmul_kernels/dual-wmma/cudanative.sh @@ -0,0 +1,22 @@ +#!/usr/bin/env bash +set -Eeuo pipefail + +if [[ $# < 1 ]]; then + echo "Usage $0 " 1>&2 + exit 1 +fi + +JULIA_PATH=$1 + +cd "$( dirname "${BASH_SOURCE[0]}" )" + +printf "N,runtime\n" >cudanative.csv + +for i in {7..14}; do + N=$((2**i)) + + # runtime in ns + runtime=$(LD_LIBRARY_PATH=${JULIA_PATH}/usr/lib nv-nsight-cu-cli --profile-from-start off -f --summary per-kernel --csv --units base ${JULIA_PATH}/julia cudanative.jl $N $N $N 2>/dev/null | grep 'gpu__time_duration' | tail -1 | awk -F',' '{print $NF}' | sed 's/"//g') + + printf "$N,$runtime\n" >>cudanative.csv +done diff --git a/test/perf/matmul_kernels/dual-wmma/plot.jl b/test/perf/matmul_kernels/dual-wmma/plot.jl new file mode 100644 index 00000000..b731ab97 --- /dev/null +++ b/test/perf/matmul_kernels/dual-wmma/plot.jl @@ -0,0 +1,25 @@ +using CSV +using DataFrames +using Plots + +pyplot() + +function plot_results(file, label) + df = DataFrame(CSV.File(file)) + + N = df[!, :N] + mean_runtime = df[!, :runtime] .* 1e3 # in ps + + tflops = (6 .* N .^ 3) ./ mean_runtime + + plot!(N, tflops, label=label, xscale=:log2, markershape=:circle) +end + +plot_results("cudanative.csv", "CUDAnative") 
From 9cb4b3c1763ad14809492a2167c9fe0ffac16783 Mon Sep 17 00:00:00 2001
From: Thomas Faingnaert
Date: Fri, 29 May 2020 14:05:03 -0400
Subject: [PATCH 34/34] Add translate variant for offset

---
 src/device/matmul_kernels/kernel.jl   |  8 +++---
 src/device/matmul_kernels/operator.jl | 36 ++++++++++++++++++++---------
 src/device/tiling.jl                  |  9 +++++++
 3 files changed, 37 insertions(+), 16 deletions(-)

diff --git a/src/device/matmul_kernels/kernel.jl b/src/device/matmul_kernels/kernel.jl
index d25c502c..baab4e19 100644
--- a/src/device/matmul_kernels/kernel.jl
+++ b/src/device/matmul_kernels/kernel.jl
@@ -45,7 +45,7 @@ function matmul_impl(a, b, c, d,
 
     @unroll for i = 1 : NUM_FRAGMENTS_M
         @unroll for j = 1 : NUM_FRAGMENTS_N
-            tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N))
+            tile = translate_offset(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N))
             @inbounds c_frags[i, j] = transf_sh2rf_c(Operator.load_c(OPERATOR, SHARED_C_LAYOUT, shmem_c, tile), tile)
         end
     end
@@ -84,7 +84,7 @@ function matmul_impl(a, b, c, d,
         a_frags = MArray{Tuple{NUM_FRAGMENTS_M}, Operator.fragtype_a(OPERATOR, SHARED_A_LAYOUT)}(undef)
 
         @unroll for i = 1 : NUM_FRAGMENTS_M
-            a_tile = translate(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0))
+            a_tile = translate_offset(warp_tile.MK, (M = (i-1)*COMPUTE_OP_SHAPE.M, K = 0))
            @inbounds a_frags[i] = transf_sh2rf_a(Operator.load_a(OPERATOR, SHARED_A_LAYOUT, shmem_a, a_tile), a_tile)
         end
 
@@ -92,7 +92,7 @@ function matmul_impl(a, b, c, d,
         b_frags = MArray{Tuple{NUM_FRAGMENTS_N}, Operator.fragtype_b(OPERATOR, SHARED_B_LAYOUT)}(undef)
 
         @unroll for j = 1 : NUM_FRAGMENTS_N
-            b_tile = translate(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N))
+            b_tile = translate_offset(warp_tile.KN, (K = 0, N = (j-1)*COMPUTE_OP_SHAPE.N))
            @inbounds b_frags[j] = transf_sh2rf_b(Operator.load_b(OPERATOR, SHARED_B_LAYOUT, shmem_b, b_tile), b_tile)
         end
 
@@ -114,7 +114,7 @@ function matmul_impl(a, b, c, d,
 
     @unroll for i = 1 : NUM_FRAGMENTS_M
         @unroll for j = 1 : NUM_FRAGMENTS_N
-            tile = translate(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N))
+            tile = translate_offset(warp_tile, (M = (i-1)*COMPUTE_OP_SHAPE.M, N = (j-1)*COMPUTE_OP_SHAPE.N))
             Operator.store_d(OPERATOR, SHARED_D_LAYOUT, shmem_d, transf_rf2sh_d(c_frags[i, j], tile), tile)
         end
     end
diff --git a/src/device/matmul_kernels/operator.jl b/src/device/matmul_kernels/operator.jl
index b082996f..1414b594 100644
--- a/src/device/matmul_kernels/operator.jl
+++ b/src/device/matmul_kernels/operator.jl
@@ -25,31 +25,43 @@ struct WMMAOp{M, N, K} end
 @inline fragtype_b(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float16}}) = WMMA.Fragment{16, 16, 16, 16, Float16, WMMA.ColMajor, WMMA.MatrixB}
 @inline fragtype_accum(::Type{WMMAOp{16, 16, 16}}, ::Type{Layout.AlignedColMajor{Float32}}) = WMMA.Fragment{16, 16, 16, 8, Float32, WMMA.Unspecified, WMMA.Accumulator}
 
-function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K}
+@inline function load_a(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K}
     conf = WMMA.Config{M, N, K, Float32}
-    linear_index = linearise(tile.index, size(workspace))
-    ptr = pointer(workspace, linear_index)
+
+    linear_base = linearise(tile.base, size(workspace))
+    linear_offset = linearise(tile.offset, size(workspace))
+
+    ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float16)
     return WMMA.load_a(ptr, size(workspace, 1), WMMA.ColMajor, conf)
 end
 
-function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K}
+@inline function load_b(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float16}}, workspace, tile::Tile) where {M, N, K}
     conf = WMMA.Config{M, N, K, Float32}
-    linear_index = linearise(tile.index, size(workspace))
-    ptr = pointer(workspace, linear_index)
+
+    linear_base = linearise(tile.base, size(workspace))
+    linear_offset = linearise(tile.offset, size(workspace))
+
+    ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float16)
     return WMMA.load_b(ptr, size(workspace, 1), WMMA.ColMajor, conf)
 end
 
-function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K}
+@inline function load_c(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, tile::Tile) where {M, N, K}
     conf = WMMA.Config{M, N, K, Float32}
-    linear_index = linearise(tile.index, size(workspace))
-    ptr = pointer(workspace, linear_index)
+
+    linear_base = linearise(tile.base, size(workspace))
+    linear_offset = linearise(tile.offset, size(workspace))
+
+    ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float32)
     return WMMA.load_c(ptr, size(workspace, 1), WMMA.ColMajor, conf)
 end
 
-function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K}
+@inline function store_d(::Type{WMMAOp{M, N, K}}, ::Type{Layout.AlignedColMajor{Float32}}, workspace, frag, tile::Tile) where {M, N, K}
     conf = WMMA.Config{M, N, K, Float32}
-    linear_index = linearise(tile.index, size(workspace))
-    ptr = pointer(workspace, linear_index)
+
+    linear_base = linearise(tile.base, size(workspace))
+    linear_offset = linearise(tile.offset, size(workspace))
+
+    ptr = pointer(workspace, linear_base) + (linear_offset - 1) * sizeof(Float32)
     WMMA.store_d(ptr, frag, size(workspace, 1), WMMA.ColMajor, conf)
 end
 
diff --git a/src/device/tiling.jl b/src/device/tiling.jl
index 1bc52e00..891a6315 100644
--- a/src/device/tiling.jl
+++ b/src/device/tiling.jl
@@ -132,6 +132,15 @@ end
 
 @inline translate(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate(tile, NamedTuple{names}(offset))
 
+export translate_offset
+
+@inline function translate_offset(tile::Tile{size, names, T}, offset::NamedTuple{names, T}) where {names, T, size}
+    new_offset = map(+, tile.offset, offset)
+    return Tile{size, names, T}(tile.base, new_offset)
+end
+
+@inline translate_offset(tile::Tile{size, names, T}, offset::Tuple) where {names, T, size} = translate_offset(tile, NamedTuple{names}(offset))
+
 # -------------
 # TileIterators
 # -------------
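The new translate_offset keeps a tile's base untouched and only advances its offset; the kernel.jl hunks above use it to step across WMMA fragments within a warp tile, and the operator.jl hunks then linearise base and offset separately when forming the shared-memory pointer. A minimal usage sketch, assuming the Tiling module name and a zero-initialised keyword constructor Tile(M = ..., N = ...) as defined earlier in this series (adjust the names if they differ):

    using CUDAnative.Tiling   # assumed module path for Tile and translate_offset

    # Hypothetical 64 x 64 warp tile; base and offset are assumed to start at zero.
    warp_tile = Tile(M = 64, N = 64)

    # Only the offset moves; the base stays where it was.
    frag_tile = translate_offset(warp_tile, (M = 16, N = 32))
    # frag_tile.base   == (M = 0,  N = 0)
    # frag_tile.offset == (M = 16, N = 32)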