From 416b28f11e0f4b509e6efca1a06babcd4997d45b Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Fri, 11 Apr 2025 17:00:48 -0300 Subject: [PATCH 1/8] Faster matmul --- src/host/linalg.jl | 95 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/src/host/linalg.jl b/src/host/linalg.jl index 2c928747..b59598f6 100644 --- a/src/host/linalg.jl +++ b/src/host/linalg.jl @@ -325,11 +325,92 @@ function LinearAlgebra.ldiv!(B::AbstractGPUVecOrMat, B end +# XXX: figure out how to do dynamically +MAX_TILE_DIM = 16 ## matrix multiplication # legacy method generic_matmatmul!(C::AbstractArray, A::AbstractArray, B::AbstractArray, a::Number, b::Number) = generic_matmatmul!(C, A, B, MulAddMul(a, b)) +function generic_matmatmul!(C::AbstractGPUMatrix{R}, A::AbstractGPUMatrix{T}, B::AbstractGPUMatrix{S}, add::MulAddMul) where {T<:Number,S<:Number,R<:Number} + N = size(A,1) + Q = size(A,2) + M = size(B,2) + if Q != size(B,1) + throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))")) + end + if size(C,1) != N || size(C,2) != M + throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((N,M))")) + end + if isempty(A) || isempty(B) + return fill!(C, zero(R)) + end + + @kernel unsafe_indices=true function coalesced_matmul_kernel!( + output, @Const(input1), @Const(input2), N, Q, M, + ::Val{BANK} = Val(1), + ) where {BANK} + grow, gcol = @index(Group, NTuple) + tile_row, tile_col = @index(Local, NTuple) + + TILE_DIM = @uniform @groupsize()[1] + + # +1 to avoid bank conflicts on shared memory + tile1 = @localmem(R, (TILE_DIM + BANK, TILE_DIM)) + tile2 = @localmem(R, (TILE_DIM + BANK, TILE_DIM)) + + # private variable for tile output + outval = @private R 1 + @inbounds outval[1] = -zero(R) + + # number of tiles depends on inner dimension + @uniform NUM_TILES = div(Q + TILE_DIM - 1, TILE_DIM) + + # loop over all tiles needed for this calculation + for t in 0:(NUM_TILES - 1) + I = (grow - 1) * TILE_DIM + tile_row + J = (gcol - 1) * TILE_DIM + tile_col + + # load inputs into tiles, with bounds checking for non-square matrices + if I <= N && t * TILE_DIM + tile_col <= Q + @inbounds tile1[tile_row, tile_col] = input1[I, t * TILE_DIM + tile_col] + else + @inbounds tile1[tile_row, tile_col] = zero(R) + end + if J <= M && t * TILE_DIM + tile_row <= Q + @inbounds tile2[tile_row, tile_col] = input2[t * TILE_DIM + tile_row, J] + else + @inbounds tile2[tile_row, tile_col] = zero(R) + end + + # wait for all tiles to be loaded + @synchronize + + I = (grow - 1) * TILE_DIM + tile_row + J = (gcol - 1) * TILE_DIM + tile_col + + # calculate value of spot in output, use temporary value to allow for vectorization + out = zero(R) + @simd for k in 1:TILE_DIM + @inbounds out += tile1[tile_row, k] * tile2[k, tile_col] + end + outval[1] += out + + @synchronize + end + + I = (grow - 1) * TILE_DIM + tile_row + J = (gcol - 1) * TILE_DIM + tile_col + + # save if inbounds + if I <= N && J <= M + @inbounds output[I, J] = add(outval[1], output[I, J]) + end + end + + coalesced_matmul_kernel!(get_backend(C), (MAX_TILE_DIM, MAX_TILE_DIM))(C, A, B, N, Q, M;ndrange=map(x -> ceil(Int,x/MAX_TILE_DIM)*MAX_TILE_DIM, size(C))) + C +end function generic_matmatmul!(C::AbstractArray{R}, A::AbstractArray{T}, B::AbstractArray{S}, add::MulAddMul) where {T,S,R} if size(A,2) != size(B,1) throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))")) @@ -744,7 +825,7 @@ function LinearAlgebra.kron!(z::AbstractGPUVector{T1}, x::AbstractGPUVector{T2}, @kernel function kron_kernel!(z, @Const(x), @Const(y)) i, j = @index(Global, NTuple) - + @inbounds z[(i - 1) * length(y) + j] = x[i] * y[j] end @@ -777,13 +858,13 @@ for (wrapa, transa, unwrapa) in trans_adj_wrappers, (wrapb, transb, unwrapb) in ta = $transa(T1) tb = $transb(T2) - + @kernel function kron_kernel!(C, @Const(A), @Const(B)) ai, aj = @index(Global, NTuple) # Indices in the result matrix - + # lb1, lb2 = size(B) # Dimensions of B lb1, lb2 = tb == 'N' ? size(B) : reverse(size(B)) - + # Map global indices (ai, aj) to submatrices of the Kronecker product i_a = (ai - 1) ÷ lb1 + 1 # Corresponding row index in A i_b = (ai - 1) % lb1 + 1 # Corresponding row index in B @@ -797,12 +878,12 @@ for (wrapa, transa, unwrapa) in trans_adj_wrappers, (wrapb, transb, unwrapb) in C[ai, aj] = a_ij * b_ij end end - + backend = KernelAbstractions.get_backend(C) kernel = kron_kernel!(backend) - + kernel(C, $(unwrapa(:A)), $(unwrapb(:B)), ndrange=(size(C, 1), size(C, 2))) - + return C end From f61e6bb7493e9879fb24e4074ff667785a36aa4f Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 12 May 2025 19:35:15 -0300 Subject: [PATCH 2/8] KA 0.10 compat --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 95aa3f23..83dce68b 100644 --- a/Project.toml +++ b/Project.toml @@ -18,7 +18,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2" [compat] Adapt = "4.0" GPUArraysCore = "= 0.2.0" -KernelAbstractions = "0.9.28" +KernelAbstractions = "0.10" LLVM = "3.9, 4, 5, 6, 7, 8, 9" LinearAlgebra = "1" Printf = "1" From 1b7728d14f285ee887ca6ddb84e7b45db6230bf5 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Mon, 12 May 2025 19:36:24 -0300 Subject: [PATCH 3/8] Don't test JLArrays --- .github/workflows/Test.yml | 2 +- test/Project.toml | 1 - test/runtests.jl | 2 +- test/setup.jl | 6 +++--- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index 3c04f0f5..8a84b030 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -29,7 +29,7 @@ jobs: julia --project -e " using Pkg Pkg.develop([PackageSpec(; name=basename(path), path) for path in ARGS]) - " lib/GPUArraysCore lib/JLArrays + " lib/GPUArraysCore - uses: julia-actions/julia-runtest@v1 continue-on-error: ${{ matrix.version == 'nightly' }} - uses: julia-actions/julia-processcoverage@v1 diff --git a/test/Project.toml b/test/Project.toml index 4e233cab..ebfa0a4d 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -2,7 +2,6 @@ Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b" -JLArrays = "27aeb0d3-9eb9-45fb-866b-73c2ecf80fcb" KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" diff --git a/test/runtests.jl b/test/runtests.jl index 66d6a096..7a9bb26c 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -47,7 +47,7 @@ include("setup.jl") # make sure everything is precompiled # choose tests const tests = [] const test_runners = Dict() -for AT in (JLArray, Array), name in keys(TestSuite.tests) +for AT in (Array,), name in keys(TestSuite.tests) push!(tests, "$(AT)/$name") test_runners["$(AT)/$name"] = ()->TestSuite.tests[name](AT) end diff --git a/test/setup.jl b/test/setup.jl index 1e06e2f0..df64016e 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -1,4 +1,4 @@ -using Distributed, Test, JLArrays +using Distributed, Test include("testsuite.jl") @@ -15,7 +15,7 @@ function runtests(f, name) # generate a temporary module to execute the tests in mod_name = Symbol("Test", rand(1:100), "Main_", replace(name, '/' => '_')) mod = @eval(Main, module $mod_name end) - @eval(mod, using Test, Random, JLArrays) + @eval(mod, using Test, Random) let id = myid() wait(@spawnat 1 print_testworker_started(name, id)) @@ -24,7 +24,7 @@ function runtests(f, name) ex = quote GC.gc(true) Random.seed!(1) - JLArrays.allowscalar(false) + # JLArrays.allowscalar(false) @timed @testset $"$name" begin $f() From 63aa4642084c8990fac3c48ba7b0c2e4225e8eec Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 13 May 2025 13:27:53 -0300 Subject: [PATCH 4/8] Revert "Faster matmul" This reverts commit 416b28f11e0f4b509e6efca1a06babcd4997d45b. --- src/host/linalg.jl | 95 ++++------------------------------------------ 1 file changed, 7 insertions(+), 88 deletions(-) diff --git a/src/host/linalg.jl b/src/host/linalg.jl index b59598f6..2c928747 100644 --- a/src/host/linalg.jl +++ b/src/host/linalg.jl @@ -325,92 +325,11 @@ function LinearAlgebra.ldiv!(B::AbstractGPUVecOrMat, B end -# XXX: figure out how to do dynamically -MAX_TILE_DIM = 16 ## matrix multiplication # legacy method generic_matmatmul!(C::AbstractArray, A::AbstractArray, B::AbstractArray, a::Number, b::Number) = generic_matmatmul!(C, A, B, MulAddMul(a, b)) -function generic_matmatmul!(C::AbstractGPUMatrix{R}, A::AbstractGPUMatrix{T}, B::AbstractGPUMatrix{S}, add::MulAddMul) where {T<:Number,S<:Number,R<:Number} - N = size(A,1) - Q = size(A,2) - M = size(B,2) - if Q != size(B,1) - throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))")) - end - if size(C,1) != N || size(C,2) != M - throw(DimensionMismatch("result C has dimensions $(size(C)), needs $((N,M))")) - end - if isempty(A) || isempty(B) - return fill!(C, zero(R)) - end - - @kernel unsafe_indices=true function coalesced_matmul_kernel!( - output, @Const(input1), @Const(input2), N, Q, M, - ::Val{BANK} = Val(1), - ) where {BANK} - grow, gcol = @index(Group, NTuple) - tile_row, tile_col = @index(Local, NTuple) - - TILE_DIM = @uniform @groupsize()[1] - - # +1 to avoid bank conflicts on shared memory - tile1 = @localmem(R, (TILE_DIM + BANK, TILE_DIM)) - tile2 = @localmem(R, (TILE_DIM + BANK, TILE_DIM)) - - # private variable for tile output - outval = @private R 1 - @inbounds outval[1] = -zero(R) - - # number of tiles depends on inner dimension - @uniform NUM_TILES = div(Q + TILE_DIM - 1, TILE_DIM) - - # loop over all tiles needed for this calculation - for t in 0:(NUM_TILES - 1) - I = (grow - 1) * TILE_DIM + tile_row - J = (gcol - 1) * TILE_DIM + tile_col - - # load inputs into tiles, with bounds checking for non-square matrices - if I <= N && t * TILE_DIM + tile_col <= Q - @inbounds tile1[tile_row, tile_col] = input1[I, t * TILE_DIM + tile_col] - else - @inbounds tile1[tile_row, tile_col] = zero(R) - end - if J <= M && t * TILE_DIM + tile_row <= Q - @inbounds tile2[tile_row, tile_col] = input2[t * TILE_DIM + tile_row, J] - else - @inbounds tile2[tile_row, tile_col] = zero(R) - end - - # wait for all tiles to be loaded - @synchronize - - I = (grow - 1) * TILE_DIM + tile_row - J = (gcol - 1) * TILE_DIM + tile_col - - # calculate value of spot in output, use temporary value to allow for vectorization - out = zero(R) - @simd for k in 1:TILE_DIM - @inbounds out += tile1[tile_row, k] * tile2[k, tile_col] - end - outval[1] += out - - @synchronize - end - - I = (grow - 1) * TILE_DIM + tile_row - J = (gcol - 1) * TILE_DIM + tile_col - - # save if inbounds - if I <= N && J <= M - @inbounds output[I, J] = add(outval[1], output[I, J]) - end - end - - coalesced_matmul_kernel!(get_backend(C), (MAX_TILE_DIM, MAX_TILE_DIM))(C, A, B, N, Q, M;ndrange=map(x -> ceil(Int,x/MAX_TILE_DIM)*MAX_TILE_DIM, size(C))) - C -end function generic_matmatmul!(C::AbstractArray{R}, A::AbstractArray{T}, B::AbstractArray{S}, add::MulAddMul) where {T,S,R} if size(A,2) != size(B,1) throw(DimensionMismatch("matrix A has dimensions $(size(A)), matrix B has dimensions $(size(B))")) @@ -825,7 +744,7 @@ function LinearAlgebra.kron!(z::AbstractGPUVector{T1}, x::AbstractGPUVector{T2}, @kernel function kron_kernel!(z, @Const(x), @Const(y)) i, j = @index(Global, NTuple) - + @inbounds z[(i - 1) * length(y) + j] = x[i] * y[j] end @@ -858,13 +777,13 @@ for (wrapa, transa, unwrapa) in trans_adj_wrappers, (wrapb, transb, unwrapb) in ta = $transa(T1) tb = $transb(T2) - + @kernel function kron_kernel!(C, @Const(A), @Const(B)) ai, aj = @index(Global, NTuple) # Indices in the result matrix - + # lb1, lb2 = size(B) # Dimensions of B lb1, lb2 = tb == 'N' ? size(B) : reverse(size(B)) - + # Map global indices (ai, aj) to submatrices of the Kronecker product i_a = (ai - 1) ÷ lb1 + 1 # Corresponding row index in A i_b = (ai - 1) % lb1 + 1 # Corresponding row index in B @@ -878,12 +797,12 @@ for (wrapa, transa, unwrapa) in trans_adj_wrappers, (wrapb, transb, unwrapb) in C[ai, aj] = a_ij * b_ij end end - + backend = KernelAbstractions.get_backend(C) kernel = kron_kernel!(backend) - + kernel(C, $(unwrapa(:A)), $(unwrapb(:B)), ndrange=(size(C, 1), size(C, 2))) - + return C end From 07fbb88f85de7244233e487036f511c66c95ceff Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 13 May 2025 13:31:40 -0300 Subject: [PATCH 5/8] ytd --- .github/workflows/Test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index 8a84b030..7e3a0b87 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -29,6 +29,7 @@ jobs: julia --project -e " using Pkg Pkg.develop([PackageSpec(; name=basename(path), path) for path in ARGS]) + Pkg.develop("KernelAbstractions") " lib/GPUArraysCore - uses: julia-actions/julia-runtest@v1 continue-on-error: ${{ matrix.version == 'nightly' }} From 007466e399de73e0c94f26bd6584f630fe127309 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 13 May 2025 13:39:37 -0300 Subject: [PATCH 6/8] esthr --- test/runtests.jl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index 7a9bb26c..a065e687 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,8 +1,12 @@ +using Pkg +Pkg.develop("KernelAbstractions") + using Distributed using Dates import REPL using Printf: @sprintf + # parse some command-line arguments function extract_flag!(args, flag, default=nothing) for f in args From 5b0794778559ce68448a487b25cf057a9cf9b5ec Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 13 May 2025 22:05:18 -0300 Subject: [PATCH 7/8] hsgff --- .github/workflows/Test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index 7e3a0b87..87b7703b 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -28,8 +28,8 @@ jobs: run: | julia --project -e " using Pkg + Pkg.develop(\"KernelAbstractions\") Pkg.develop([PackageSpec(; name=basename(path), path) for path in ARGS]) - Pkg.develop("KernelAbstractions") " lib/GPUArraysCore - uses: julia-actions/julia-runtest@v1 continue-on-error: ${{ matrix.version == 'nightly' }} From 113448c5fce049343ead58be83814e8dc17caf16 Mon Sep 17 00:00:00 2001 From: Christian Guinard <28689358+christiangnrd@users.noreply.github.com> Date: Tue, 13 May 2025 22:08:49 -0300 Subject: [PATCH 8/8] jrgfdsh --- .github/workflows/Test.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index 87b7703b..6bc0b4aa 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -26,11 +26,11 @@ jobs: - uses: julia-actions/cache@v2 - name: Develop subpackages run: | - julia --project -e " + julia --project -e ' using Pkg - Pkg.develop(\"KernelAbstractions\") + Pkg.develop("KernelAbstractions") Pkg.develop([PackageSpec(; name=basename(path), path) for path in ARGS]) - " lib/GPUArraysCore + ' lib/GPUArraysCore - uses: julia-actions/julia-runtest@v1 continue-on-error: ${{ matrix.version == 'nightly' }} - uses: julia-actions/julia-processcoverage@v1