diff --git a/src/sanity.f90 b/src/sanity.f90 index c9182e52..a0b83d4d 100644 --- a/src/sanity.f90 +++ b/src/sanity.f90 @@ -52,8 +52,10 @@ subroutine test_sanity_input(ng,dims,stop_type,cbcvel,cbcpre,bcvel,bcpre,is_forc if(myid == 0) print*, 'ERROR: `_IMPDIFF_1D` cpp macro requires building with `_IMPDIFF` too.'; call abortit #endif #if defined(_IMPDIFF_1D) && !defined(_DECOMP_Z) - if(myid == 0) print*, 'WARNING: a run with implicit Z diffusion (`_IMPDIFF_1D`) is much more efficient & - & when combined with a Z-pencils parallelization (`_DECOMP_Z`).' + if(dims(2) > 1) then + if(myid == 0) print*, 'WARNING: a run with implicit Z diffusion (`_IMPDIFF_1D`) is much more efficient & + & when the flow is not decomposed along the Z direction.' + end if #endif end subroutine test_sanity_input ! diff --git a/src/solver.f90 b/src/solver.f90 index ba37c1ce..24c34a3d 100644 --- a/src/solver.f90 +++ b/src/solver.f90 @@ -210,57 +210,62 @@ subroutine solver_gaussel_z(n,a,b,c,bcz,c_or_f,p) character(len=1), dimension(0:1), intent(in) :: bcz character(len=1), intent(in), dimension(3) :: c_or_f real(rp), intent(inout), dimension(0:,0:,0:) :: p -#if !defined(_DECOMP_Z) - real(rp), dimension(xsize(1),xsize(2),xsize(3)) :: px - real(rp), dimension(ysize(1),ysize(2),ysize(3)) :: py - real(rp), dimension(zsize(1),zsize(2),zsize(3)) :: pz -#endif + real(rp), allocatable, dimension(:,:,:), save :: px,py,pz integer :: q integer, dimension(3) :: n_z + logical :: is_no_decomp_z ! n_z(:) = zsize(:) + is_no_decomp_z = xsize(3) == n_z(3) ! not decomposed along z: xsize(3) == ysize(3) == ng(3) when dims(2) = 1 + if(.not.is_no_decomp_z) then + if(.not.allocated(px)) allocate(px(xsize(1),xsize(2),xsize(3))) + if(.not.allocated(py)) allocate(py(ysize(1),ysize(2),ysize(3))) + if(.not.allocated(pz)) allocate(pz(zsize(1),zsize(2),zsize(3))) #if !defined(_DECOMP_Y) && !defined(_DECOMP_Z) - !$OMP PARALLEL WORKSHARE - px(:,:,:) = p(1:n(1),1:n(2),1:n(3)) - !$OMP END PARALLEL WORKSHARE - !call transpose_x_to_z(px,pz) - call transpose_x_to_y(px,py) - call transpose_y_to_z(py,pz) + !$OMP PARALLEL WORKSHARE + px(:,:,:) = p(1:n(1),1:n(2),1:n(3)) + !$OMP END PARALLEL WORKSHARE + !call transpose_x_to_z(px,pz) + call transpose_x_to_y(px,py) + call transpose_y_to_z(py,pz) #elif defined(_DECOMP_Y) - !$OMP PARALLEL WORKSHARE - py(:,:,:) = p(1:n(1),1:n(2),1:n(3)) - !$OMP END PARALLEL WORKSHARE - call transpose_y_to_z(py,pz) + !$OMP PARALLEL WORKSHARE + py(:,:,:) = p(1:n(1),1:n(2),1:n(3)) + !$OMP END PARALLEL WORKSHARE + call transpose_y_to_z(py,pz) #endif + end if q = 0 if(c_or_f(3) == 'f'.and.bcz(1) == 'D') q = 1 -#if !defined(_DECOMP_Z) - if(bcz(0)//bcz(1) == 'PP') then - call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz) - else - call gaussel( n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz) - end if -#else - if(bcz(0)//bcz(1) == 'PP') then - call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p) + if(.not.is_no_decomp_z) then + if(bcz(0)//bcz(1) == 'PP') then + call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz) + else + call gaussel( n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz) + end if else - call gaussel( n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p) + if(bcz(0)//bcz(1) == 'PP') then + call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p) + else + call gaussel( n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p) + end if end if -#endif ! + if(.not.is_no_decomp_z) then #if !defined(_DECOMP_Y) && !defined(_DECOMP_Z) - !call transpose_z_to_x(pz,px) - call transpose_z_to_y(pz,py) - call transpose_y_to_x(py,px) - !$OMP PARALLEL WORKSHARE - p(1:n(1),1:n(2),1:n(3)) = px(:,:,:) - !$OMP END PARALLEL WORKSHARE + !call transpose_z_to_x(pz,px) + call transpose_z_to_y(pz,py) + call transpose_y_to_x(py,px) + !$OMP PARALLEL WORKSHARE + p(1:n(1),1:n(2),1:n(3)) = px(:,:,:) + !$OMP END PARALLEL WORKSHARE #elif defined(_DECOMP_Y) - call transpose_z_to_y(pz,py) - !$OMP PARALLEL WORKSHARE - p(1:n(1),1:n(2),1:n(3)) = py(:,:,:) - !$OMP END PARALLEL WORKSHARE + call transpose_z_to_y(pz,py) + !$OMP PARALLEL WORKSHARE + p(1:n(1),1:n(2),1:n(3)) = py(:,:,:) + !$OMP END PARALLEL WORKSHARE #endif + end if end subroutine solver_gaussel_z ! #if 0 diff --git a/src/solver_gpu.f90 b/src/solver_gpu.f90 index 4d80b5dd..5b34c634 100644 --- a/src/solver_gpu.f90 +++ b/src/solver_gpu.f90 @@ -392,54 +392,59 @@ subroutine solver_gaussel_z_gpu(n,a,b,c,bcz,c_or_f,p) integer :: q integer, dimension(3) :: n_x,n_y,n_z,n_z_0 integer :: istat + logical :: is_no_decomp_z ! n_z_0(:) = ap_z_0%shape(:) -#if !defined(_DECOMP_Z) n_x(:) = ap_x%shape(:) n_y(:) = ap_y%shape(:) n_z(:) = ap_z%shape(:) - px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:))) - if(cudecomp_is_t_in_place) then - py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:))) - else - py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_1(1:product(n_y(:))) + is_no_decomp_z = n_x(3) == n_z(3) ! not decomposed along z: xsize(3) == ysize(3) == ng(3) when dims(2) = 1 + if(.not.is_no_decomp_z) then + px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:))) + if(cudecomp_is_t_in_place) then + py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:))) + else + py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_1(1:product(n_y(:))) + end if + pz(1:n_z(1),1:n_z(2),1:n_z(3)) => solver_buf_0(1:product(n_z(:))) end if - pz(1:n_z(1),1:n_z(2),1:n_z(3)) => solver_buf_0(1:product(n_z(:))) -#endif - select case(ipencil_axis) - case(1) - !$acc kernels default(present) async(1) - !$OMP PARALLEL WORKSHARE - px(:,:,:) = p(1:n(1),1:n(2),1:n(3)) - !$OMP END PARALLEL WORKSHARE - !$acc end kernels - !$acc host_data use_device(px,py,pz,work) - istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) - istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) - !$acc end host_data - case(2) - block - integer :: i,j,k - ! - ! transpose p -> py to axis-contiguous layout - ! - !$acc parallel loop collapse(3) default(present) async(1) - !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) - do k=1,n(3) - do j=1,n(2) - do i=1,n(1) - py(j,k,i) = p(i,j,k) + ! + if(.not.is_no_decomp_z) then + select case(ipencil_axis) + case(1) + !$acc kernels default(present) async(1) + !$OMP PARALLEL WORKSHARE + px(:,:,:) = p(1:n(1),1:n(2),1:n(3)) + !$OMP END PARALLEL WORKSHARE + !$acc end kernels + !$acc host_data use_device(px,py,pz,work) + istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream) + istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) + !$acc end host_data + case(2) + block + integer :: i,j,k + ! + ! transpose p -> py to axis-contiguous layout + ! + !$acc parallel loop collapse(3) default(present) async(1) + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) + do k=1,n(3) + do j=1,n(2) + do i=1,n(1) + py(j,k,i) = p(i,j,k) + end do end do end do - end do - end block - !$acc host_data use_device(py,pz,work) - istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) - !$acc end host_data - case(3) - end select + end block + !$acc host_data use_device(py,pz,work) + istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream) + !$acc end host_data + case(3) + end select + end if ! - if(ipencil_axis /= 3) then + if(ipencil_axis /= 3 .and. .not.is_no_decomp_z) then q = 0 if(c_or_f(3) == 'f'.and.bcz(1) == 'D') q = 1 if(bcz(0)//bcz(1) == 'PP') then @@ -457,38 +462,40 @@ subroutine solver_gaussel_z_gpu(n,a,b,c,bcz,c_or_f,p) end if end if ! - select case(ipencil_axis) - case(1) - !$acc host_data use_device(pz,py,px,work) - istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) - istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) - !$acc end host_data - !$acc kernels default(present) async(1) - !$OMP PARALLEL WORKSHARE - p(1:n(1),1:n(2),1:n(3)) = px(:,:,:) - !$OMP END PARALLEL WORKSHARE - !$acc end kernels - case(2) - !$acc host_data use_device(pz,py,work) - istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) - !$acc end host_data - block - integer :: i,j,k - ! - ! transpose py -> p to default layout - ! - !$acc parallel loop collapse(3) default(present) async(1) - !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) - do k=1,n(3) - do j=1,n(2) - do i=1,n(1) - p(i,j,k) = py(j,k,i) + if(.not.is_no_decomp_z) then + select case(ipencil_axis) + case(1) + !$acc host_data use_device(pz,py,px,work) + istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) + istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream) + !$acc end host_data + !$acc kernels default(present) async(1) + !$OMP PARALLEL WORKSHARE + p(1:n(1),1:n(2),1:n(3)) = px(:,:,:) + !$OMP END PARALLEL WORKSHARE + !$acc end kernels + case(2) + !$acc host_data use_device(pz,py,work) + istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream) + !$acc end host_data + block + integer :: i,j,k + ! + ! transpose py -> p to default layout + ! + !$acc parallel loop collapse(3) default(present) async(1) + !$OMP PARALLEL DO COLLAPSE(3) DEFAULT(shared) + do k=1,n(3) + do j=1,n(2) + do i=1,n(1) + p(i,j,k) = py(j,k,i) + end do end do end do - end do - end block - case(3) - end select + end block + case(3) + end select + end if end subroutine solver_gaussel_z_gpu #endif #endif