diff --git a/src/array.jl b/src/array.jl
index ff9e2d4916..4f7558daa4 100644
--- a/src/array.jl
+++ b/src/array.jl
@@ -519,7 +519,151 @@ function Base.copyto!(dest::DenseCuArray{T}, doffs::Integer, src::DenseCuArray{T
 end
 
 Base.copyto!(dest::DenseCuArray{T}, src::DenseCuArray{T}) where {T} =
-  copyto!(dest, 1, src, 1, length(src))
+copyto!(dest, 1, src, 1, length(src))
+
+# TODO: expand this for StridedMatrices of different shapes; currently the src needs to fit in the destination
+# TODO: add parameters doffs, soffs, n
+
+# copyto! between strided host/device (sub)arrays, built on Mem.unsafe_copy3d!
+for (destType, srcType) in ((StridedSubCuArray, SubArray), (SubArray, StridedSubCuArray),
+                            (StridedSubCuArray, StridedSubCuArray),
+                            (StridedSubCuArray, Array), (Array, StridedSubCuArray),
+                            (CuArray, StridedSubCuArray), (StridedSubCuArray, CuArray),
+                            (CuArray, SubArray), (SubArray, CuArray))
+    @eval begin
+        # Copy2D=true requests a shape-preserving block copy when size(dest, 1) != size(src, 1)
+        function Base.copyto!(dest::$destType{T,2}, src::$srcType{T,2}, Copy2D::Bool=false) where {T}
+            if (dest isa StridedSubCuArray) || (dest isa SubArray)
+                # locate the two non-scalar (ranged) dimensions of the view
+                dest_index1 = findfirst((typeof.(dest.indices) .<: Int) .== 0)
+                dest_index2 = findnext((typeof.(dest.indices) .<: Int) .== 0, dest_index1+1)
+                dest_step_x = step(dest.indices[dest_index1])
+                dest_step_height = step(dest.indices[dest_index2])
+                dest_parent_size = size(parent(dest))
+            else
+                dest_index1 = 1
+                dest_index2 = 2
+                dest_step_x = 1
+                dest_step_height = 1
+                dest_parent_size = size(dest)
+            end
+            if (src isa StridedSubCuArray) || (src isa SubArray)
+                src_index1 = findfirst((typeof.(src.indices) .<: Int) .== 0)
+                src_index2 = findnext((typeof.(src.indices) .<: Int) .== 0, src_index1+1)
+                src_step_x = step(src.indices[src_index1])
+                src_step_height = step(src.indices[src_index2])
+                src_parent_size = size(parent(src))
+            else
+                src_index1 = 1
+                src_index2 = 2
+                src_step_x = 1
+                src_step_height = 1
+                src_parent_size = size(src)
+            end
+
+            dest_pitch1 = (dest_index1 == 1) ? 1 : prod(dest_parent_size[1:(dest_index1-1)])
+            dest_pitch2 = prod(dest_parent_size[dest_index1:(dest_index2-1)])
+            src_pitch1 = (src_index1 == 1) ? 1 : prod(src_parent_size[1:(src_index1-1)])
+            src_pitch2 = prod(src_parent_size[src_index1:(src_index2-1)])
+            destLocation = ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
+            srcLocation = ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
+            @boundscheck checkbounds(1:size(dest, 1), 1:size(src, 1))
+            @boundscheck checkbounds(1:size(dest, 2), 1:size(src, 2))
+
+            if (size(dest, 1) == size(src, 1)) || Copy2D
+                # Non-contiguous views can be accommodated by copy3d in certain cases
+                if isinteger(src_pitch2*src_step_height/src_step_x/src_pitch1) && isinteger(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1)
+                    Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
+                                       1, size(src, 1), size(src, 2);
+                                       srcPos=(1,1,1), dstPos=(1,1,1),
+                                       srcPitch=src_step_x*sizeof(T)*src_pitch1, srcHeight=Int(src_pitch2*src_step_height/src_step_x/src_pitch1),
+                                       dstPitch=dest_step_x*sizeof(T)*dest_pitch1, dstHeight=Int(dest_pitch2*dest_step_height/dest_step_x/dest_pitch1))
+                else
+                    # In other cases, copy column-by-column on parallel threads
+                    CUDA.synchronize()
+                    Base.@sync for col in 1:size(src, 2)
+                        Threads.@spawn begin
+                            Mem.unsafe_copy3d!(pointer(view(dest, :, col)), destLocation, pointer(view(src, :, col)), srcLocation,
+                                               1, 1, size(src, 1);
+                                               srcPos=(1,1,1), dstPos=(1,1,1),
+                                               srcPitch=sizeof(T)*src_step_x*src_pitch1, srcHeight=1,
+                                               dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
+                            CUDA.synchronize()
+                        end
+                    end
+                end
+            else # ensure the same behavior as Base when copying a smaller matrix into a bigger one and Copy2D is false
+                start_indices = (1:size(src, 1):size(src, 1)*(size(src, 2)+1))
+                dest_col = div.(start_indices .- 1, size(dest, 1)) .+ 1
+                start_indices = mod.(start_indices, size(dest, 1))
+                replace!(start_indices, 0 => size(dest, 1))
+                split_col = start_indices[1:end-1] .> start_indices[2:end]
+
+                CUDA.synchronize()
+                Base.@sync for col in 1:size(src, 2)
+                    Threads.@spawn begin
+                        n = split_col[col] ? (size(dest, 1) - start_indices[col] + 1) : size(src, 1)
+                        Mem.unsafe_copy3d!(pointer(view(dest, :, dest_col[col])), destLocation, pointer(view(src, :, col)), srcLocation,
+                                           1, 1, n;
+                                           srcPos=(1,1,1), dstPos=(1,1,start_indices[col]),
+                                           srcPitch=sizeof(T)*src_step_x*src_pitch1, srcHeight=1,
+                                           dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
+                        if split_col[col]
+                            # the source column wraps around into the next destination column
+                            Mem.unsafe_copy3d!(pointer(view(dest, :, dest_col[col]+1)), destLocation, pointer(view(src, :, col)), srcLocation,
+                                               1, 1, size(src, 1) - n;
+                                               srcPos=(1,1,n+1), dstPos=(1,1,1),
+                                               srcPitch=sizeof(T)*src_step_x*src_pitch1, srcHeight=1,
+                                               dstPitch=sizeof(T)*dest_step_x*dest_pitch1, dstHeight=1)
+                        end
+                        CUDA.synchronize()
+                    end
+                end
+            end
+
+            return dest
+        end
+
+        function Base.copyto!(dest::$destType{T,1}, doffs::Integer, src::$srcType{T,1}, soffs::Integer,
+                              n::Integer) where {T}
+            n == 0 && return dest
+            @boundscheck checkbounds(dest, doffs)
+            @boundscheck checkbounds(dest, doffs+n-1)
+            @boundscheck checkbounds(src, soffs)
+            @boundscheck checkbounds(src, soffs+n-1)
+            if (dest isa StridedSubCuArray) || (dest isa SubArray)
+                # locate the single non-scalar (ranged) dimension of the view
+                dest_index = findfirst((typeof.(dest.indices) .<: Int) .== 0)
+                dest_step = step(dest.indices[dest_index])
+                dest_pitch = (dest_index == 1) ? 1 : prod(size(parent(dest))[1:(dest_index-1)])
+            else
+                dest_index = 1
+                dest_step = 1
+                dest_pitch = 1
+            end
+
+            if (src isa StridedSubCuArray) || (src isa SubArray)
+                src_index = findfirst((typeof.(src.indices) .<: Int) .== 0)
+                src_step = step(src.indices[src_index])
+                src_pitch = (src_index == 1) ? 1 : prod(size(parent(src))[1:(src_index-1)])
+            else
+                src_index = 1
+                src_step = 1
+                src_pitch = 1
+            end
+            destLocation = ((dest isa StridedSubCuArray) || (dest isa CuArray)) ? Mem.Device : Mem.Host
+            srcLocation = ((src isa StridedSubCuArray) || (src isa CuArray)) ? Mem.Device : Mem.Host
+
+            Mem.unsafe_copy3d!(pointer(dest), destLocation, pointer(src), srcLocation,
+                               1, 1, n;
+                               srcPos=(1,1,soffs), dstPos=(1,1,doffs),
+                               srcPitch=src_step*sizeof(T)*src_pitch, srcHeight=1,
+                               dstPitch=dest_step*sizeof(T)*dest_pitch, dstHeight=1)
+            return dest
+        end
+
+        Base.copyto!(dest::$destType{T}, src::$srcType{T}) where {T} =
+            copyto!(dest, 1, src, 1, length(src))
+    end
+end
 
 # general case: use CUDA APIs
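Note on the pitch arithmetic above: each strided 2-D view is described to Mem.unsafe_copy3d! as size(src, 2) slices of size(src, 1) one-element rows, where the row pitch is the element stride along the first selected dimension and the slice height is the spacing between consecutive selected columns measured in units of that row pitch; the single-call path is only taken when that height works out to a whole number of rows. A minimal sketch of the same computation for a hypothetical view (array, shape and ranges below are illustrative only, not code from this patch):

    using CUDA

    A = CUDA.rand(Float32, 16, 32)      # hypothetical parent array on the device
    v = view(A, 1:1:16, 1:2:32)         # strided 2-D view: step 1 along dim 1, step 2 along dim 2
    T = eltype(v)

    step_x      = step(v.indices[1])    # element stride along the first selected dimension (1)
    step_height = step(v.indices[2])    # column stride along the second selected dimension (2)
    pitch1      = 1                     # elements preceding the first selected dimension (here it is dim 1)
    pitch2      = size(parent(v), 1)    # elements per parent column (16)

    srcPitch  = step_x * sizeof(T) * pitch1               # bytes between consecutive view elements along dim 1
    srcHeight = pitch2 * step_height ÷ (step_x * pitch1)  # rows per view column; the copy3d path is only used
                                                          # when this ratio is an integer, otherwise the
                                                          # threaded per-column fallback runs instead

For the ranges used in the tests below (e.g. 2:3:11 into an 11-row parent), the ratio is not an integer, so those copies exercise the per-column fallback.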
diff --git a/test/base/array.jl b/test/base/array.jl
index 9fb511f7c8..87488f8470 100644
--- a/test/base/array.jl
+++ b/test/base/array.jl
@@ -368,6 +368,140 @@
   @test view(b, :, 1, :) isa StridedCuArray
 end
 
+@testset "elty = $elty" for elty in [Float32, Float64, ComplexF32, ComplexF64]
+  @testset "copyto StridedCuArray" begin
+    n = 17
+    m = 11
+    k = 23
+    l = 19
+    range1 = 2:3:11
+    range2 = 3:2:11
+    range3 = 1:5:16
+    range4 = 4:4:20
+
+    # From GPU to CPU
+    gpu_matrix = CUDA.rand(elty, m, n)
+    cpu_matrix = rand(elty, l, k)
+    gpu_view = view(gpu_matrix, range1, range2)
+    cpu_view = view(cpu_matrix, range3, range4)
+    copyto!(cpu_view, gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_matrix = CUDA.rand(elty, m, n)
+    cpu_matrix = rand(elty, l, k)
+    gpu_view = view(gpu_matrix, :, :)
+    cpu_view = view(cpu_matrix, 1:m, 1:n)
+    copyto!(cpu_view, gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty, l)
+    gpu_view = view(gpu_vec, range1)
+    cpu_view = view(cpu_vec, range3)
+    copyto!(cpu_view, gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty, l)
+    gpu_view = view(gpu_vec, :)
+    cpu_view = view(cpu_vec, 1:m)
+    copyto!(cpu_view, gpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    # From CPU to GPU
+    gpu_matrix = CUDA.rand(elty, m, n)
+    cpu_matrix = rand(elty, l, k)
+    gpu_view = view(gpu_matrix, range1, range2)
+    cpu_view = view(cpu_matrix, range3, range4)
+    copyto!(gpu_view, cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_matrix = CUDA.rand(elty, m, n)
+    cpu_matrix = rand(elty, l, k)
+    gpu_view = view(gpu_matrix, :, :)
+    cpu_view = view(cpu_matrix, 1:m, 1:n)
+    copyto!(gpu_view, cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty, l)
+    gpu_view = view(gpu_vec, range1)
+    cpu_view = view(cpu_vec, range3)
+    copyto!(gpu_view, cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    gpu_vec = CUDA.rand(elty, m)
+    cpu_vec = rand(elty, l)
+    gpu_view = view(gpu_vec, :)
+    cpu_view = view(cpu_vec, 1:m)
+    copyto!(gpu_view, cpu_view)
+    @test collect(gpu_view) == cpu_view
+
+    # From GPU to GPU
+    gpu_matrix = CUDA.rand(elty, m, n)
+    gpu_matrix2 = CUDA.rand(elty, l, k)
+    gpu_view = view(gpu_matrix, range1, range2)
+    gpu_view2 = view(gpu_matrix2, range3, range4)
+    copyto!(gpu_view, gpu_view2)
+    @test collect(gpu_view) == collect(gpu_view2)
+
+    gpu_matrix = CUDA.rand(elty, m, n)
+    gpu_matrix2 = CUDA.rand(elty, l, k)
+    gpu_view = view(gpu_matrix, :, :)
+    gpu_view2 = view(gpu_matrix2, 1:m, 1:n)
+    copyto!(gpu_view, gpu_view2)
+    @test collect(gpu_view) == collect(gpu_view2)
+
+    gpu_vec = CUDA.rand(elty, m)
+    gpu_vec2 = CUDA.rand(elty, l)
+    gpu_view = view(gpu_vec, range1)
+    gpu_view2 = view(gpu_vec2, range3)
+    copyto!(gpu_view, gpu_view2)
+    @test collect(gpu_view) == collect(gpu_view2)
+
+    gpu_vec = CUDA.rand(elty, m)
+    gpu_vec2 = CUDA.rand(elty, l)
+    gpu_view = view(gpu_vec, :)
+    gpu_view2 = view(gpu_vec2, 1:m)
+    copyto!(gpu_view, gpu_view2)
+    @test collect(gpu_view) == collect(gpu_view2)
+
+    # Testing higher-dimensional views
+    for gpu_indices in ((range1, range2, 3, 7), (range1, 3, range2, 7),
+                        (range1, 3, 7, range2), (3, range1, range2, 7),
+                        (3, range1, 7, range2), (3, 7, range1, range2))
+      for cpu_indices in ((range3, range4, 11, 5), (range3, 11, range4, 5),
+                          (range3, 11, 5, range4), (11, range3, range4, 5),
+                          (11, range3, 5, range4), (11, 5, range3, range4))
+        gpu_matrix = CUDA.rand(elty, m*3, n*3, k*3, l*3)
+        cpu_matrix = rand(elty, m*2, n*2, k*2, l*2)
+        gpu_view = view(gpu_matrix, gpu_indices...)
+        cpu_view = view(cpu_matrix, cpu_indices...)
+        copyto!(gpu_view, cpu_view)
+        @test collect(gpu_view) == cpu_view
+      end
+    end
+
+    for gpu_indices in ((range1, 13, 3, 7), (3, range1, 7, 13),
+                        (3, 7, range1, 13), (3, 7, 13, range1))
+      for cpu_indices in ((range3, 11, 2, 5), (3, range3, 2, 11),
+                          (2, 5, range3, 11), (2, 5, 11, range3))
+        gpu_matrix = CUDA.rand(elty, m*3, n*3, k*3, l*3)
+        cpu_matrix = rand(elty, m*2, n*2, k*2, l*2)
+        gpu_view = view(gpu_matrix, gpu_indices...)
+        cpu_view = view(cpu_matrix, cpu_indices...)
+        copyto!(gpu_view, cpu_view)
+        @test collect(gpu_view) == cpu_view
+      end
+    end
+  end
+end
+
 @testset "accumulate" begin
   for n in (0, 1, 2, 3, 10, 10_000, 16384, 16384+1) # small, large, odd & even, pow2 and not
     @test testf(x->accumulate(+, x), rand(n))
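For completeness, a short usage sketch of the methods added above (sizes and ranges are arbitrary examples in the spirit of the tests, not code from this patch):

    using CUDA

    gpu = CUDA.rand(Float32, 11, 17)
    cpu = rand(Float32, 19, 23)

    # strided view-to-view copies across the host/device boundary
    copyto!(view(cpu, 1:5:16, 4:4:20), view(gpu, 2:3:11, 3:2:11))   # device -> host
    copyto!(view(gpu, 2:3:11, 3:2:11), view(cpu, 1:5:16, 4:4:20))   # host -> device

    # with Copy2D=true, a smaller source is copied into the top-left size(src) block
    # of a larger destination instead of following Base's linear-index behaviour
    a = CUDA.rand(Float32, 4, 4)
    b = CUDA.zeros(Float32, 8, 8)
    copyto!(view(b, 1:8, 1:8), a, true)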