Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding copyto for non-contiguous matrices and vectors #1778

Draft
wants to merge 22 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
65aabfd
Adding copyto for non-contiguous matrices and vectors
evelyne-ringoot Feb 22, 2023
e1f6c1b
Removing typo
evelyne-ringoot Feb 22, 2023
d3e93a7
Resolving weird syntax that caused compilation error
evelyne-ringoot Feb 22, 2023
f2f4d17
Resolving syntax issue and typo
evelyne-ringoot Feb 24, 2023
1fa812e
Typo
evelyne-ringoot Feb 24, 2023
e73bdfd
Testing build without sync
evelyne-ringoot Feb 24, 2023
52f8e29
Fixing function name to match copyto Base function
evelyne-ringoot Feb 28, 2023
c9030f4
Adding support for mixed views non full arrays copyto
evelyne-ringoot Feb 28, 2023
1376be6
Typos in tests
evelyne-ringoot Feb 28, 2023
84d9b4b
Adding support for 1D and 2D views of multi-dimensional arrays
evelyne-ringoot Feb 28, 2023
85b3db3
typos
evelyne-ringoot Feb 28, 2023
0650821
Changing copyto!(B,A), A>B, from 2Dcopy to the vectorcopy Base behavior
evelyne-ringoot Mar 1, 2023
fdcd875
Fixing scalar indexing in test comparisons
evelyne-ringoot Mar 1, 2023
024673f
Adding support for views of length 1
evelyne-ringoot Mar 1, 2023
7029f2e
typo
evelyne-ringoot Mar 1, 2023
4022408
Adding parallelization of copyto
evelyne-ringoot Mar 1, 2023
b545545
Removing spaces
evelyne-ringoot Mar 1, 2023
64d3ece
Resolving compilation issues
evelyne-ringoot Mar 1, 2023
3af67c4
Merge branch 'master' into copyto_views
evelyne-ringoot Sep 5, 2023
3bd7166
Merge branch 'master' into copyto_views
evelyne-ringoot Sep 11, 2023
7f74a0b
Merge branch 'master' into copyto_views
evelyne-ringoot Sep 19, 2023
d222dae
Merge branch 'JuliaGPU:master' into copyto_views
evelyne-ringoot Aug 17, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 145 additions & 1 deletion src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -519,7 +519,151 @@ function Base.copyto!(dest::DenseCuArray{T}, doffs::Integer, src::DenseCuArray{T
end

Base.copyto!(dest::DenseCuArray{T}, src::DenseCuArray{T}) where {T} =
copyto!(dest, 1, src, 1, length(src))
copyto!(dest, 1, src, 1, length(src))

#TO DO: expand this for StridedMatrices of different shapes, currently the src needs to fit in the destination
#TO DO: add parameters doffs, soffs, n

# Generate `copyto!` methods for every pairing of host/device plain arrays and
# (strided) views, so non-contiguous matrices and vectors are copied through
# CUDA's 3-D memcpy machinery instead of falling back to scalar indexing.
for (destType,srcType) in ((StridedSubCuArray, SubArray) , (SubArray, StridedSubCuArray),
                           (StridedSubCuArray, StridedSubCuArray),
                           (StridedSubCuArray, Array) , (Array, StridedSubCuArray),
                           (CuArray, StridedSubCuArray) , (StridedSubCuArray, CuArray),
                           (CuArray, SubArray) , (SubArray, CuArray))
    @eval begin
        # Copy a 2-D (possibly strided) `src` into `dest`. When the column
        # lengths match (or `Copy2D` is set) a single strided 2-D copy is
        # attempted; otherwise the elements are copied in `Base`'s linear-index
        # order, so a source column may straddle two destination columns.
        function Base.copyto!(dest::$destType{T,2}, src::$srcType{T,2}, Copy2D::Bool=false) where {T}
            # Locate the two non-scalar dimensions of a view and derive the
            # element steps relative to the parent array; plain arrays are
            # their own parent with unit steps.
            if (dest isa StridedSubCuArray) || (dest isa SubArray)
                dest_index1 = findfirst((typeof.(dest.indices) .<: Int) .== 0)
                dest_index2 = findnext((typeof.(dest.indices) .<: Int) .== 0, dest_index1 + 1)
                dest_step_x = step(dest.indices[dest_index1])
                dest_step_height = step(dest.indices[dest_index2])
                dest_parent_size = size(parent(dest))
            else
                dest_index1 = 1
                dest_index2 = 2
                dest_step_x = 1
                dest_step_height = 1
                dest_parent_size = size(dest)
            end
            if (src isa StridedSubCuArray) || (src isa SubArray)
                src_index1 = findfirst((typeof.(src.indices) .<: Int) .== 0)
                src_index2 = findnext((typeof.(src.indices) .<: Int) .== 0, src_index1 + 1)
                src_step_x = step(src.indices[src_index1])
                src_step_height = step(src.indices[src_index2])
                src_parent_size = size(parent(src))
            else
                src_index1 = 1
                src_index2 = 2
                src_step_x = 1
                src_step_height = 1
                src_parent_size = size(src)
            end

            # Pitches: number of parent elements separating consecutive copied
            # items along each of the two copied dimensions.
            dest_pitch1 = (dest_index1 == 1) ? 1 : prod(dest_parent_size[1:(dest_index1 - 1)])
            dest_pitch2 = prod(dest_parent_size[dest_index1:(dest_index2 - 1)])
            src_pitch1 = (src_index1 == 1) ? 1 : prod(src_parent_size[1:(src_index1 - 1)])
            src_pitch2 = prod(src_parent_size[src_index1:(src_index2 - 1)])
            destLocation = ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
            srcLocation = ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
            @boundscheck checkbounds(1:size(dest, 1), 1:size(src, 1))
            @boundscheck checkbounds(1:size(dest, 2), 1:size(src, 2))

            if (size(dest, 1) == size(src, 1) || Copy2D)
                # Non-contigous views can be accomodated by copy3d in certain
                # cases: the per-row stride must evenly divide the per-column
                # stride on both sides so it can be expressed as a pitch/height.
                if isinteger(src_pitch2 * src_step_height / src_step_x / src_pitch1) &&
                   isinteger(dest_pitch2 * dest_step_height / dest_step_x / dest_pitch1)
                    Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
                                       1, size(src, 1), size(src, 2);
                                       srcPos=(1, 1, 1), dstPos=(1, 1, 1),
                                       srcPitch=src_step_x * sizeof(T) * src_pitch1,
                                       srcHeight=Int(src_pitch2 * src_step_height / src_step_x / src_pitch1),
                                       dstPitch=dest_step_x * sizeof(T) * dest_pitch1,
                                       dstHeight=Int(dest_pitch2 * dest_step_height / dest_step_x / dest_pitch1))
                else
                    # In other cases, issue one strided 1-D copy per column from
                    # parallel tasks.
                    CUDA.synchronize()
                    # BUGFIX: iterate over the column count of `src` rather than
                    # `src.indices[src_index2]`; the latter only exists for view
                    # sources and threw for plain (Cu)Array sources. For a 2-D
                    # view, size(src, 2) == length(src.indices[src_index2]).
                    Base.@sync for col in 1:size(src, 2)
                        Threads.@spawn begin
                            Mem.unsafe_copy3d!(pointer(view(dest, :, col)), destLocation,
                                               pointer(view(src, :, col)), srcLocation,
                                               1, 1, size(src, 1);
                                               srcPos=(1, 1, 1), dstPos=(1, 1, 1),
                                               srcPitch=sizeof(T) * src_step_x * src_pitch1, srcHeight=1,
                                               dstPitch=sizeof(T) * dest_step_x * dest_pitch1, dstHeight=1)
                            CUDA.synchronize()
                        end
                    end
                end
            else
                # Ensure same behavior as Base copying from smaller to bigger
                # matrix if Copy2D is false: fill the destination in linear-index
                # order, so one source column can wrap across two destination
                # columns.
                start_indices = (1:size(src, 1):size(src, 1) * (size(src, 2) + 1))
                dest_col = div.(start_indices .- 1, size(dest, 1)) .+ 1
                start_indices = mod.(start_indices, size(dest, 1))
                replace!(start_indices, 0 => size(dest, 1))
                # `true` where a source column straddles a destination column
                # boundary and must be copied in two pieces.
                split_col = start_indices[1:end-1] .> start_indices[2:end]

                CUDA.synchronize()
                # BUGFIX: as above, use size(src, 2) so plain-array sources work.
                Base.@sync for col in 1:size(src, 2)
                    Threads.@spawn begin
                        # Length of the first destination segment for this column.
                        n = split_col[col] ? (size(dest, 1) - start_indices[col] + 1) : size(src, 1)
                        Mem.unsafe_copy3d!(pointer(view(dest, :, dest_col[col])), destLocation,
                                           pointer(view(src, :, col)), srcLocation,
                                           1, 1, n;
                                           srcPos=(1, 1, 1), dstPos=(1, 1, start_indices[col]),
                                           srcPitch=sizeof(T) * src_step_x * src_pitch1, srcHeight=1,
                                           dstPitch=sizeof(T) * dest_step_x * dest_pitch1, dstHeight=1)
                        if split_col[col]
                            # Remainder of the source column wraps into the next
                            # destination column.
                            Mem.unsafe_copy3d!(pointer(view(dest, :, dest_col[col] + 1)), destLocation,
                                               pointer(view(src, :, col)), srcLocation,
                                               1, 1, size(src, 1) - n;
                                               srcPos=(1, 1, n + 1), dstPos=(1, 1, 1),
                                               srcPitch=sizeof(T) * src_step_x * src_pitch1, srcHeight=1,
                                               dstPitch=sizeof(T) * dest_step_x * dest_pitch1, dstHeight=1)
                        end
                        CUDA.synchronize()
                    end
                end
            end

            return dest
        end

        # Copy `n` elements of the 1-D (possibly strided) `src`, starting at
        # `soffs`, into `dest` starting at `doffs`.
        function Base.copyto!(dest::$destType{T,1}, doffs::Integer, src::$srcType{T,1},
                              soffs::Integer, n::Integer) where {T}
            n == 0 && return dest
            @boundscheck checkbounds(dest, doffs)
            @boundscheck checkbounds(dest, doffs + n - 1)
            @boundscheck checkbounds(src, soffs)
            @boundscheck checkbounds(src, soffs + n - 1)
            # Step and pitch of the single non-scalar dimension relative to the
            # parent array; plain arrays are contiguous (unit step and pitch).
            if (dest isa StridedSubCuArray) || (dest isa SubArray)
                dest_index = findfirst((typeof.(dest.indices) .<: Int) .== 0)
                dest_step = step(dest.indices[dest_index])
                dest_pitch = (dest_index == 1) ? 1 : prod(size(parent(dest))[1:(dest_index - 1)])
            else
                dest_index = 1
                dest_step = 1
                dest_pitch = 1
            end

            if (src isa StridedSubCuArray) || (src isa SubArray)
                src_index = findfirst((typeof.(src.indices) .<: Int) .== 0)
                src_step = step(src.indices[src_index])
                src_pitch = (src_index == 1) ? 1 : prod(size(parent(src))[1:(src_index - 1)])
            else
                src_index = 1
                src_step = 1
                src_pitch = 1
            end
            destLocation = ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
            srcLocation = ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host

            # A strided 1-D copy is a degenerate 3-D copy: one element per
            # "row", with the row pitch equal to the element stride in bytes.
            Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
                               1, 1, n;
                               srcPos=(1, 1, soffs), dstPos=(1, 1, doffs),
                               srcPitch=src_step * sizeof(T) * src_pitch, srcHeight=1,
                               dstPitch=dest_step * sizeof(T) * dest_pitch, dstHeight=1)
            return dest
        end

        Base.copyto!(dest::$destType{T}, src::$srcType{T}) where {T} =
            copyto!(dest, 1, src, 1, length(src))
    end
end

# general case: use CUDA APIs

Expand Down
134 changes: 134 additions & 0 deletions test/base/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,140 @@ end
@test view(b, :, 1, :) isa StridedCuArray
end

@testset "elty = $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
    @testset "copyto StridedCuArray" begin
        m, n, k, l = 11, 17, 23, 19
        range1, range2 = 2:3:11, 3:2:11
        range3, range4 = 1:5:16, 4:4:20

        # Fresh, randomly-initialized parents for every copy so one test cannot
        # mask a failure in another.
        dmat()  = CUDA.rand(elty, m, n)
        dmat2() = CUDA.rand(elty, l, k)
        hmat()  = rand(elty, l, k)
        dvec()  = CUDA.rand(elty, m)
        dvec2() = CUDA.rand(elty, l)
        hvec()  = rand(elty, l)

        # Every device/host pairing, with both strided-range views and
        # whole-array views, for matrices and vectors alike. Each entry is a
        # pair of thunks building the destination and the source view.
        cases = (
            # device → host
            (() -> view(hmat(), range3, range4), () -> view(dmat(), range1, range2)),
            (() -> view(hmat(), 1:m, 1:n),       () -> view(dmat(), :, :)),
            (() -> view(hvec(), range3),         () -> view(dvec(), range1)),
            (() -> view(hvec(), 1:m),            () -> view(dvec(), :)),
            # host → device
            (() -> view(dmat(), range1, range2), () -> view(hmat(), range3, range4)),
            (() -> view(dmat(), :, :),           () -> view(hmat(), 1:m, 1:n)),
            (() -> view(dvec(), range1),         () -> view(hvec(), range3)),
            (() -> view(dvec(), :),              () -> view(hvec(), 1:m)),
            # device → device
            (() -> view(dmat(), range1, range2), () -> view(dmat2(), range3, range4)),
            (() -> view(dmat(), :, :),           () -> view(dmat2(), 1:m, 1:n)),
            (() -> view(dvec(), range1),         () -> view(dvec2(), range3)),
            (() -> view(dvec(), :),              () -> view(dvec2(), 1:m)),
        )
        for (mkdst, mksrc) in cases
            dst, src = mkdst(), mksrc()
            copyto!(dst, src)
            # Collect both sides so comparison never scalar-indexes the GPU.
            @test collect(dst) == collect(src)
        end

        # 2-D views carved out of 4-D parents: the two ranged dimensions can sit
        # at any pair of positions among the four.
        gpu2d = ((range1, range2, 3, 7), (range1, 3, range2, 7),
                 (range1, 3, 7, range2), (3, range1, range2, 7),
                 (3, range1, 7, range2), (3, 7, range1, range2))
        cpu2d = ((range3, range4, 11, 5), (range3, 11, range4, 5),
                 (range3, 11, 5, range4), (11, range3, range4, 5),
                 (11, range3, 5, range4), (11, 5, range3, range4))
        for didx in gpu2d, hidx in cpu2d
            dparent = CUDA.rand(elty, m*3, n*3, k*3, l*3)
            hparent = rand(elty, m*2, n*2, k*2, l*2)
            dv, hv = view(dparent, didx...), view(hparent, hidx...)
            copyto!(dv, hv)
            @test collect(dv) == hv
        end

        # 1-D views of 4-D parents: the single ranged dimension can sit at any
        # of the four positions.
        gpu1d = ((range1, 13, 3, 7), (3, range1, 7, 13),
                 (3, 7, range1, 13), (3, 7, 13, range1))
        cpu1d = ((range3, 11, 2, 5), (3, range3, 2, 11),
                 (2, 5, range3, 11), (2, 5, 11, range3))
        for didx in gpu1d, hidx in cpu1d
            dparent = CUDA.rand(elty, m*3, n*3, k*3, l*3)
            hparent = rand(elty, m*2, n*2, k*2, l*2)
            dv, hv = view(dparent, didx...), view(hparent, hidx...)
            copyto!(dv, hv)
            @test collect(dv) == hv
        end
    end
end

@testset "accumulate" begin
for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384+1) # small, large, odd & even, pow2 and not
@test testf(x->accumulate(+, x), rand(n))
Expand Down