diff --git a/src/sanity.f90 b/src/sanity.f90
index c9182e52..a0b83d4d 100644
--- a/src/sanity.f90
+++ b/src/sanity.f90
@@ -52,8 +52,10 @@ subroutine test_sanity_input(ng,dims,stop_type,cbcvel,cbcpre,bcvel,bcpre,is_forc
     if(myid == 0)  print*, 'ERROR: `_IMPDIFF_1D` cpp macro requires building with `_IMPDIFF` too.'; call abortit
 #endif
 #if defined(_IMPDIFF_1D) && !defined(_DECOMP_Z)
-    if(myid == 0)  print*, 'WARNING: a run with implicit Z diffusion (`_IMPDIFF_1D`) is much more efficient &
-                                   & when combined with a Z-pencils parallelization (`_DECOMP_Z`).'
+    if(dims(2) > 1) then ! warn only if the domain is actually split along Z (no split when dims(2) = 1)
+      if(myid == 0)  print*, 'WARNING: a run with implicit Z diffusion (`_IMPDIFF_1D`) is much more efficient &
+                                     & when the flow is not decomposed along the Z direction.'
+    end if
 #endif
   end subroutine test_sanity_input
   !
diff --git a/src/solver.f90 b/src/solver.f90
index ba37c1ce..24c34a3d 100644
--- a/src/solver.f90
+++ b/src/solver.f90
@@ -210,57 +210,62 @@ subroutine solver_gaussel_z(n,a,b,c,bcz,c_or_f,p)
     character(len=1), dimension(0:1), intent(in) :: bcz
     character(len=1), intent(in), dimension(3) :: c_or_f
     real(rp), intent(inout), dimension(0:,0:,0:) :: p
-#if !defined(_DECOMP_Z)
-    real(rp), dimension(xsize(1),xsize(2),xsize(3)) :: px
-    real(rp), dimension(ysize(1),ysize(2),ysize(3)) :: py
-    real(rp), dimension(zsize(1),zsize(2),zsize(3)) :: pz
-#endif
+    real(rp), allocatable, dimension(:,:,:), save :: px,py,pz ! transpose work buffers, allocated on first use and kept
     integer :: q
     integer, dimension(3) :: n_z
+    logical :: is_no_decomp_z ! .true. when p already holds complete z lines on this rank
     !
     n_z(:) = zsize(:)
+    is_no_decomp_z = n(3) == n_z(3) ! local z extent of p is the full one: dims(2) = 1, or p is already a Z pencil
+    if(.not.is_no_decomp_z) then ! transposes needed: make sure the work buffers exist
+      if(.not.allocated(px)) allocate(px(xsize(1),xsize(2),xsize(3)))
+      if(.not.allocated(py)) allocate(py(ysize(1),ysize(2),ysize(3)))
+      if(.not.allocated(pz)) allocate(pz(zsize(1),zsize(2),zsize(3)))
 #if !defined(_DECOMP_Y) && !defined(_DECOMP_Z)
-    !$OMP PARALLEL WORKSHARE
-    px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
-    !$OMP END PARALLEL WORKSHARE
-    !call transpose_x_to_z(px,pz)
-    call transpose_x_to_y(px,py)
-    call transpose_y_to_z(py,pz)
+      !$OMP PARALLEL WORKSHARE
+      px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
+      !$OMP END PARALLEL WORKSHARE
+      !call transpose_x_to_z(px,pz)
+      call transpose_x_to_y(px,py)
+      call transpose_y_to_z(py,pz)
 #elif defined(_DECOMP_Y)
-    !$OMP PARALLEL WORKSHARE
-    py(:,:,:) = p(1:n(1),1:n(2),1:n(3))
-    !$OMP END PARALLEL WORKSHARE
-    call transpose_y_to_z(py,pz)
+      !$OMP PARALLEL WORKSHARE
+      py(:,:,:) = p(1:n(1),1:n(2),1:n(3))
+      !$OMP END PARALLEL WORKSHARE
+      call transpose_y_to_z(py,pz)
 #endif
+    end if
     q = 0
     if(c_or_f(3) == 'f'.and.bcz(1) == 'D') q = 1
-#if !defined(_DECOMP_Z)
-    if(bcz(0)//bcz(1) == 'PP') then
-      call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
-    else
-      call gaussel(         n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
-    end if
-#else
-    if(bcz(0)//bcz(1) == 'PP') then
-      call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p)
+    if(.not.is_no_decomp_z) then ! solve on the transposed buffer pz, sized by zsize
+      if(bcz(0)//bcz(1) == 'PP') then
+        call gaussel_periodic(n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
+      else
+        call gaussel(         n_z(1),n_z(2),n_z(3)-q,0,a,b,c,pz)
+      end if
     else
-      call gaussel(         n_z(1),n_z(2),n_z(3)-q,1,a,b,c,p)
+      if(bcz(0)//bcz(1) == 'PP') then
+        call gaussel_periodic(n(1),n(2),n(3)-q,1,a,b,c,p) ! in-place on p: pass the local extents n that index p
+      else
+        call gaussel(         n(1),n(2),n(3)-q,1,a,b,c,p) ! (the extents in n_z describe pz, not p)
+      end if
     end if
-#endif
     !
+    if(.not.is_no_decomp_z) then ! bring the solution back from pz to the layout of p
 #if !defined(_DECOMP_Y) && !defined(_DECOMP_Z)
-    !call transpose_z_to_x(pz,px)
-    call transpose_z_to_y(pz,py)
-    call transpose_y_to_x(py,px)
-    !$OMP PARALLEL WORKSHARE
-    p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
-    !$OMP END PARALLEL WORKSHARE
+      !call transpose_z_to_x(pz,px)
+      call transpose_z_to_y(pz,py)
+      call transpose_y_to_x(py,px)
+      !$OMP PARALLEL WORKSHARE
+      p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
+      !$OMP END PARALLEL WORKSHARE
 #elif defined(_DECOMP_Y)
-    call transpose_z_to_y(pz,py)
-    !$OMP PARALLEL WORKSHARE
-    p(1:n(1),1:n(2),1:n(3)) = py(:,:,:)
-    !$OMP END PARALLEL WORKSHARE
+      call transpose_z_to_y(pz,py)
+      !$OMP PARALLEL WORKSHARE
+      p(1:n(1),1:n(2),1:n(3)) = py(:,:,:)
+      !$OMP END PARALLEL WORKSHARE
 #endif
+    end if
   end subroutine solver_gaussel_z
   !
 #if 0
diff --git a/src/solver_gpu.f90 b/src/solver_gpu.f90
index 4d80b5dd..5b34c634 100644
--- a/src/solver_gpu.f90
+++ b/src/solver_gpu.f90
@@ -392,54 +392,59 @@ subroutine solver_gaussel_z_gpu(n,a,b,c,bcz,c_or_f,p)
     integer :: q
     integer, dimension(3) :: n_x,n_y,n_z,n_z_0
     integer :: istat
+    logical :: is_no_decomp_z ! when .true., all transposes below are skipped
     !
     n_z_0(:) = ap_z_0%shape(:)
-#if !defined(_DECOMP_Z)
     n_x(:) = ap_x%shape(:)
     n_y(:) = ap_y%shape(:)
     n_z(:) = ap_z%shape(:)
-    px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:)))
-    if(cudecomp_is_t_in_place) then
-      py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:)))
-    else
-      py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_1(1:product(n_y(:)))
+    is_no_decomp_z = n_x(3) == n_z(3) ! not decomposed along z: xsize(3) == ysize(3) == ng(3) when dims(2) = 1
+    if(.not.is_no_decomp_z) then ! px, py, pz are only associated when transposes will be performed
+      px(1:n_x(1),1:n_x(2),1:n_x(3)) => solver_buf_0(1:product(n_x(:)))
+      if(cudecomp_is_t_in_place) then
+        py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_0(1:product(n_y(:)))
+      else
+        py(1:n_y(1),1:n_y(2),1:n_y(3)) => solver_buf_1(1:product(n_y(:)))
+      end if
+      pz(1:n_z(1),1:n_z(2),1:n_z(3)) => solver_buf_0(1:product(n_z(:)))
     end if
-    pz(1:n_z(1),1:n_z(2),1:n_z(3)) => solver_buf_0(1:product(n_z(:)))
-#endif
-    select case(ipencil_axis)
-    case(1)
-      !$acc kernels default(present) async(1)
-      !$OMP PARALLEL WORKSHARE
-      px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
-      !$OMP END PARALLEL WORKSHARE
-      !$acc end kernels
-      !$acc host_data use_device(px,py,pz,work)
-      istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
-      istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
-      !$acc end host_data
-    case(2)
-      block
-        integer :: i,j,k
-        !
-        ! transpose p -> py to axis-contiguous layout
-        !
-        !$acc parallel loop collapse(3) default(present) async(1)
-        !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)
-        do k=1,n(3)
-          do j=1,n(2)
-            do i=1,n(1)
-              py(j,k,i) = p(i,j,k)
+    !
+    if(.not.is_no_decomp_z) then ! forward path: copy p into a work buffer and transpose it up to pz
+      select case(ipencil_axis)
+      case(1)
+        !$acc kernels default(present) async(1)
+        !$OMP PARALLEL WORKSHARE
+        px(:,:,:) = p(1:n(1),1:n(2),1:n(3))
+        !$OMP END PARALLEL WORKSHARE
+        !$acc end kernels
+        !$acc host_data use_device(px,py,pz,work)
+        istat = cudecompTransposeXtoY(ch,gd,px,py,work,dtype_rp,stream=istream)
+        istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
+        !$acc end host_data
+      case(2)
+        block
+          integer :: i,j,k
+          !
+          ! transpose p -> py to axis-contiguous layout
+          !
+          !$acc parallel loop collapse(3) default(present) async(1)
+          !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)
+          do k=1,n(3)
+            do j=1,n(2)
+              do i=1,n(1)
+                py(j,k,i) = p(i,j,k)
+              end do
             end do
           end do
-        end do
-      end block
-      !$acc host_data use_device(py,pz,work)
-      istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
-      !$acc end host_data
-    case(3)
-    end select
+        end block
+        !$acc host_data use_device(py,pz,work)
+        istat = cudecompTransposeYtoZ(ch,gd,py,pz,work,dtype_rp,stream=istream)
+        !$acc end host_data
+      case(3)
+      end select
+    end if
     !
-    if(ipencil_axis /= 3) then
+    if(ipencil_axis /= 3 .and. .not.is_no_decomp_z) then ! transposed case. NOTE(review): verify the else-branch sizes match p
       q = 0
       if(c_or_f(3) == 'f'.and.bcz(1) == 'D') q = 1
       if(bcz(0)//bcz(1) == 'PP') then
@@ -457,38 +462,40 @@ subroutine solver_gaussel_z_gpu(n,a,b,c,bcz,c_or_f,p)
       end if
     end if
     !
-    select case(ipencil_axis)
-    case(1)
-      !$acc host_data use_device(pz,py,px,work)
-      istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
-      istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
-      !$acc end host_data
-      !$acc kernels default(present) async(1)
-      !$OMP PARALLEL WORKSHARE
-      p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
-      !$OMP END PARALLEL WORKSHARE
-      !$acc end kernels
-    case(2)
-      !$acc host_data use_device(pz,py,work)
-      istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
-      !$acc end host_data
-      block
-        integer :: i,j,k
-        !
-        ! transpose py -> p to default layout
-        !
-        !$acc parallel loop collapse(3) default(present) async(1)
-        !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)
-        do k=1,n(3)
-          do j=1,n(2)
-            do i=1,n(1)
-              p(i,j,k) = py(j,k,i)
+    if(.not.is_no_decomp_z) then ! backward path: transpose pz back and copy into p; skipped with the forward one
+      select case(ipencil_axis)
+      case(1)
+        !$acc host_data use_device(pz,py,px,work)
+        istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
+        istat = cudecompTransposeYtoX(ch,gd,py,px,work,dtype_rp,stream=istream)
+        !$acc end host_data
+        !$acc kernels default(present) async(1)
+        !$OMP PARALLEL WORKSHARE
+        p(1:n(1),1:n(2),1:n(3)) = px(:,:,:)
+        !$OMP END PARALLEL WORKSHARE
+        !$acc end kernels
+      case(2)
+        !$acc host_data use_device(pz,py,work)
+        istat = cudecompTransposeZtoY(ch,gd,pz,py,work,dtype_rp,stream=istream)
+        !$acc end host_data
+        block
+          integer :: i,j,k
+          !
+          ! transpose py -> p to default layout
+          !
+          !$acc parallel loop collapse(3) default(present) async(1)
+          !$OMP PARALLEL DO   COLLAPSE(3) DEFAULT(shared)
+          do k=1,n(3)
+            do j=1,n(2)
+              do i=1,n(1)
+                p(i,j,k) = py(j,k,i)
+              end do
             end do
           end do
-        end do
-      end block
-    case(3)
-    end select
+        end block
+      case(3)
+      end select
+    end if
   end subroutine solver_gaussel_z_gpu
 #endif
 #endif