Skip to content

Commit

Permalink
Account for the special case of a non-decomposed z direction.
Browse files Browse the repository at this point in the history
  • Loading branch information
p-costa committed Jan 17, 2025
1 parent ceae6b5 commit 012febf
Show file tree
Hide file tree
Showing 3 changed files with 121 additions and 107 deletions.
6 changes: 4 additions & 2 deletions src/sanity.f90
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,10 @@ subroutine test_sanity_input(ng,dims,stop_type,cbcvel,cbcpre,bcvel,bcpre,is_forc
if(myid == 0) print*, 'ERROR: `_IMPDIFF_1D` cpp macro requires building with `_IMPDIFF` too.'; call abortit
#endif
#if defined(_IMPDIFF_1D) && !defined(_DECOMP_Z)
if(myid == 0) print*, 'WARNING: a run with implicit Z diffusion (`_IMPDIFF_1D`) is much more efficient &
& when combined with a Z-pencils parallelization (`_DECOMP_Z`).'
if(dims(2) > 1) then
if(myid == 0) print*, 'WARNING: a run with implicit Z diffusion (`_IMPDIFF_1D`) is much more efficient &
& when the flow is not decomposed along the Z direction.'
end if
#endif
end subroutine test_sanity_input
!
Expand Down
77 changes: 41 additions & 36 deletions src/solver.f90
Original file line number Diff line number Diff line change
Expand Up @@ -210,57 +210,62 @@ subroutine solver_gaussel_z(n,a,b,c,bcz,c_or_f,p)
character(len=1), dimension(0:1), intent(in) :: bcz
character(len=1), intent(in), dimension(3) :: c_or_f
real(rp), intent(inout), dimension(0:,0:,0:) :: p
#if !defined(_DECOMP_Z)
real(rp), dimension(xsize(1),xsize(2),xsize(3)) :: px
real(rp), dimension(ysize(1),ysize(2),ysize(3)) :: py
real(rp), dimension(zsize(1),zsize(2),zsize(3)) :: pz
#endif
real(rp), allocatable, dimension(:,:,:), save :: px,py,pz
integer :: q
integer, dimension(3) :: n_z
logical :: is_no_decomp_z
!
n_z(:) = zsize(:)
is_no_decomp_z = xsize(3) == n_z(3) ! not decomposed along z: xsize(3) == ysize(3) == ng(3) when dims(2) = 1
if(.not.is_no_decomp_z) then
if(.not.allocated(px)) allocate(px(xsize(1),xsize(2),xsize(3)))
if(.not.allocated(py)) allocate(py(ysize(1),ysize(2),ysize(3)))
if(.not.allocated(pz)) allocate(pz(zsize(1),zsize(2),zsize(3)))
#if !defined(_DECOMP_Y) && !defined(_DECOMP_Z)
!$OMP PARALLEL WORKSHARE
px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
!$OMP END PARALLEL WORKSHARE
!call transpose_x_to_z(px,pz)
call transpose_x_to_y(px,py)
call transpose_y_to_z(py,pz)
!$OMP PARALLEL WORKSHARE
px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
!$OMP END PARALLEL WORKSHARE
!call transpose_x_to_z(px,pz)
call transpose_x_to_y(px,py)
call transpose_y_to_z(py,pz)
#elif defined(_DECOMP_Y)
!$OMP PARALLEL WORKSHARE
py(:,:,:) = p(1:n(1),1:n(2),1:n(3))
!$OMP END PARALLEL WORKSHARE
call transpose_y_to_z(py,pz)
!$OMP PARALLEL WORKSHARE
py(:,:,:) = p(1:n(1),1:n(2),1:n(3))
!$OMP END PARALLEL WORKSHARE
call transpose_y_to_z(py,pz)
#endif
end if
q = 0
if(c_or_f(3) == 'f'.and.bcz(1) == 'D') q = 1
#if !defined(_DECOMP_Z)
if(bcz(0)//bcz(1) == 'PP') then
call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
else
call gaussel( n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
end if
#else
if(bcz(0)//bcz(1) == 'PP') then
call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p)
if(.not.is_no_decomp_z) then
if(bcz(0)//bcz(1) == 'PP') then
call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
else
call gaussel( n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
end if
else
call gaussel( n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p)
if(bcz(0)//bcz(1) == 'PP') then
call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p)
else
call gaussel( n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p)
end if
end if
#endif
!
if(.not.is_no_decomp_z) then
#if !defined(_DECOMP_Y) && !defined(_DECOMP_Z)
!call transpose_z_to_x(pz,px)
call transpose_z_to_y(pz,py)
call transpose_y_to_x(py,px)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
!$OMP END PARALLEL WORKSHARE
!call transpose_z_to_x(pz,px)
call transpose_z_to_y(pz,py)
call transpose_y_to_x(py,px)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
!$OMP END PARALLEL WORKSHARE
#elif defined(_DECOMP_Y)
call transpose_z_to_y(pz,py)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = py(:,:,:)
!$OMP END PARALLEL WORKSHARE
call transpose_z_to_y(pz,py)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = py(:,:,:)
!$OMP END PARALLEL WORKSHARE
#endif
end if
end subroutine solver_gaussel_z
!
#if 0
Expand Down
145 changes: 76 additions & 69 deletions src/solver_gpu.f90
Original file line number Diff line number Diff line change
Expand Up @@ -392,54 +392,59 @@ subroutine solver_gaussel_z_gpu(n,a,b,c,bcz,c_or_f,p)
integer :: q
integer, dimension(3) :: n_x,n_y,n_z,n_z_0
integer :: istat
logical :: is_no_decomp_z
!
n_z_0(:) = ap_z_0%shape(:)
#if !defined(_DECOMP_Z)
n_x(:) = ap_x%shape(:)
n_y(:) = ap_y%shape(:)
n_z(:) = ap_z%shape(:)
px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:)))
if(cudecomp_is_t_in_place) then
py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:)))
else
py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_1(1:product(n_y(:)))
is_no_decomp_z = n_x(3) == n_z(3) ! not decomposed along z: xsize(3) == ysize(3) == ng(3) when dims(2) = 1
if(.not.is_no_decomp_z) then
px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:)))
if(cudecomp_is_t_in_place) then
py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:)))
else
py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_1(1:product(n_y(:)))
end if
pz(1:n_z(1),1:n_z(2),1:n_z(3)) => solver_buf_0(1:product(n_z(:)))
end if
pz(1:n_z(1),1:n_z(2),1:n_z(3)) => solver_buf_0(1:product(n_z(:)))
#endif
select case(ipencil_axis)
case(1)
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
!$acc host_data use_device(px,py,pz,work)
istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
!$acc end host_data
case(2)
block
integer :: i,j,k
!
! transpose p -> py to axis-contiguous layout
!
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
py(j,k,i) = p(i,j,k)
!
if(.not.is_no_decomp_z) then
select case(ipencil_axis)
case(1)
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
!$acc host_data use_device(px,py,pz,work)
istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
!$acc end host_data
case(2)
block
integer :: i,j,k
!
! transpose p -> py to axis-contiguous layout
!
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
py(j,k,i) = p(i,j,k)
end do
end do
end do
end do
end block
!$acc host_data use_device(py,pz,work)
istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
!$acc end host_data
case(3)
end select
end block
!$acc host_data use_device(py,pz,work)
istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
!$acc end host_data
case(3)
end select
end if
!
if(ipencil_axis /= 3) then
if(ipencil_axis /= 3 .and. .not.is_no_decomp_z) then
q = 0
if(c_or_f(3) == 'f'.and.bcz(1) == 'D') q = 1
if(bcz(0)//bcz(1) == 'PP') then
Expand All @@ -457,38 +462,40 @@ subroutine solver_gaussel_z_gpu(n,a,b,c,bcz,c_or_f,p)
end if
end if
!
select case(ipencil_axis)
case(1)
!$acc host_data use_device(pz,py,px,work)
istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
!$acc end host_data
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
case(2)
!$acc host_data use_device(pz,py,work)
istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
!$acc end host_data
block
integer :: i,j,k
!
! transpose py -> p to default layout
!
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
p(i,j,k) = py(j,k,i)
if(.not.is_no_decomp_z) then
select case(ipencil_axis)
case(1)
!$acc host_data use_device(pz,py,px,work)
istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
!$acc end host_data
!$acc kernels default(present) async(1)
!$OMP PARALLEL WORKSHARE
p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
!$OMP END PARALLEL WORKSHARE
!$acc end kernels
case(2)
!$acc host_data use_device(pz,py,work)
istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
!$acc end host_data
block
integer :: i,j,k
!
! transpose py -> p to default layout
!
!$acc parallel loop collapse(3) default(present) async(1)
!$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared)
do k=1,n(3)
do j=1,n(2)
do i=1,n(1)
p(i,j,k) = py(j,k,i)
end do
end do
end do
end do
end block
case(3)
end select
end block
case(3)
end select
end if
end subroutine solver_gaussel_z_gpu
#endif
#endif
Expand Down

0 comments on commit 012febf

Please sign in to comment.