diff --git a/app/main.f90 b/app/main.f90
index 99ca6ce..67fb9b9 100644
--- a/app/main.f90
+++ b/app/main.f90
@@ -9,14 +9,14 @@ program main
   associate(input => input_t())
     output = output_t(input, matcha(input))
     block
-      double precision, allocatable :: simulated_distribution(:,:)
+      double precision, allocatable :: simulated_distribution(:,:), frequency_distribution(:)
       integer, parameter :: freq=2
       integer num_cells
 
       num_cells = output%my_num_cells()
       simulated_distribution = output%simulated_distribution()
-      simulated_distribution(:,freq) = num_cells*simulated_distribution(:,freq)
-      call co_sum(simulated_distribution(:,freq), result_image=1)
+      frequency_distribution =  num_cells*simulated_distribution(:,freq) ! copy to work around nagfor bug
+      call co_sum(frequency_distribution, result_image=1)
       call co_sum(num_cells, result_image=1)
-      if (this_image()==1) simulated_distribution(:,freq) = simulated_distribution(:,freq)/dble(num_cells)
+      if (this_image()==1) simulated_distribution(:,freq) = frequency_distribution/dble(num_cells) ! use the summed copy
     end block
diff --git a/example/heat-equation.f90 b/example/heat-equation.f90
index 3c3cb5c..2e7624c 100644
--- a/example/heat-equation.f90
+++ b/example/heat-equation.f90
@@ -76,13 +76,14 @@ module subroutine exchange_halo(self)
 
   end interface
 
+  real, allocatable :: halo_x(:,:)[:]
+
 end module
 
 submodule(subdomain_2D_m) subdomain_2D_s
   use assertions_m, only : assert
   implicit none
 
-  real, allocatable :: halo_x(:,:)[:]
   real dx_, dy_
   integer, parameter :: west=1, east=2
   integer my_nx, nx, ny, me, num_subdomains, my_internal_west, my_internal_east
@@ -114,7 +115,8 @@ module subroutine exchange_halo(self)
     self%s_(my_nx, 2:ny-1) = merge(boundary_val, internal_val, me==num_subdomains) ! east subdomain boundary
 
     if (allocated(halo_x)) deallocate(halo_x)
-    allocate(halo_x(west:east, ny)[*])
+    !allocate(halo_x(west:east, ny)[*])
+    allocate(halo_x(2, ny)[*])
     call self%exchange_halo
   end procedure
 
@@ -131,7 +133,7 @@ module subroutine exchange_halo(self)
     integer i, j
     real, allocatable :: halo_west(:), halo_east(:)
 
-    call assert(allocated(rhs%s_), "subdomain_2D_t%laplacian: allocated(rhs%s_)")
+    !call assert(allocated(rhs%s_), "subdomain_2D_t%laplacian: allocated(rhs%s_)")
     call assert(allocated(halo_x), "subdomain_2D_t%laplacian: allocated(halo_x)")
 
     allocate(laplacian_rhs(my_nx, ny))
@@ -168,8 +170,8 @@ module subroutine exchange_halo(self)
   end procedure
    
   module procedure exchange_halo
-    if (me>1) halo_x(east,:)[me-1] = self%s_(1,:)
-    if (me<num_subdomains) halo_x(west,:)[me+1] = self%s_(my_nx,:)
+    !if (me>1) halo_x(east,:)[me-1] = self%s_(1,:)
+    !if (me<num_subdomains) halo_x(west,:)[me+1] = self%s_(my_nx,:)
   end procedure
 
   module procedure values
diff --git a/example/time-paradigm.f90 b/example/time-paradigm.f90
index 1f34b8f..a77008e 100644
--- a/example/time-paradigm.f90
+++ b/example/time-paradigm.f90
@@ -2,7 +2,7 @@
 ! Terms of use are as specified in LICENSE.txt
 program time_paradigm_m
   !! Time various alternative programming paradigms
-  use subdomain_m, only : subdomain_t
+  use subdomain_m, only : subdomain_t, march
   use assert_m, only : assert
   use sourcery_m, only : string_t, file_t, command_line_t, bin_t, csv 
   use iso_fortran_env, only : int64
@@ -12,7 +12,7 @@ program time_paradigm_m
   character(len=:), allocatable :: steps_string, resolution_string
   type(command_line_t) command_line
   integer(int64) counter_start, counter_end, clock_rate
-  integer :: steps=200, resolution=64
+  integer :: steps=300, resolution=128
 
   associate(me => this_image())
     if (command_line%argument_present(["--help"])) then
@@ -52,20 +52,22 @@ function functional_programming_time() result(system_time)
     integer(int64) t_start_functional, t_end_functional, clock_rate
     integer step
     real system_time
-    type(subdomain_t) T
+    type(subdomain_t), save :: T[*]
 
     call T%define(side=1., boundary_val=T_boundary, internal_val=T_internal_initial, n=resolution)
 
-    call system_clock(t_start_functional)
-
     associate(dt => T%dx()*T%dy()/(4*alpha))
+      call system_clock(t_start_functional)
+
       functional_programming: &
       do step = 1, steps
+        sync all
         T =  T + dt * alpha * .laplacian. T
       end do functional_programming
+
+      call system_clock(t_end_functional, clock_rate)
     end associate
 
-    call system_clock(t_end_functional, clock_rate)
     system_time = real(t_end_functional - t_start_functional)/real(clock_rate)
 
     associate(L_infinity_norm => maxval(abs(T%values() - T_steady)))
@@ -78,18 +80,22 @@ function procedural_programming_time() result(system_time)
     integer(int64) t_start_procedural, t_end_procedural, clock_rate
     integer step
     real system_time
-    type(subdomain_t) T
+    type(subdomain_t), save :: T[*]
+
+    call T%define(side=1., boundary_val=0., internal_val=1., n=resolution)
 
     associate(dt => T%dx()*T%dy()/(4*alpha))
-      call T%define(side=1., boundary_val=0., internal_val=1., n=resolution)
       call system_clock(t_start_procedural)
+
       procedural_programming: &
       do step = 1, steps
-        call T%step(alpha*dt)
+        sync all
+        call march(alpha*dt, T)
       end do procedural_programming
+
+      call system_clock(t_end_procedural, clock_rate)
     end associate
 
-    call system_clock(t_end_procedural, clock_rate)
     system_time = real(t_end_procedural - t_start_procedural)/real(clock_rate)
 
     associate(L_infinity_norm => maxval(abs(T%values() - T_steady)))
diff --git a/src/matcha/distribution_s.F90 b/src/matcha/distribution_s.f90
similarity index 87%
rename from src/matcha/distribution_s.F90
rename to src/matcha/distribution_s.f90
index fff76fb..d4d14f4 100644
--- a/src/matcha/distribution_s.F90
+++ b/src/matcha/distribution_s.f90
@@ -47,8 +47,11 @@ pure function monotonically_increasing(f) result(monotonic)
       "distribution_t%cumulative_distribution: allocated(cumulative_distribution_)")
     call assert(allocated(self%vel_), "distribution_t%cumulative_distribution: allocated(vel_)")
 
-     ! Sample from the distribution
-     call do_concurrent_sampled_speeds(speeds, self%vel_, self%cumulative_distribution(), sampled_speeds)
+    ! Sample from the distribution
+    associate(ncells => size(speeds,1), nsteps => size(speeds,2))
+      allocate(sampled_speeds(ncells,nsteps))
+      call do_concurrent_sampled_speeds(speeds, self%vel_, self%cumulative_distribution(), sampled_speeds)
+    end associate
      
      associate(nsteps => size(speeds,2))
 
@@ -63,6 +66,9 @@ pure function monotonically_increasing(f) result(monotonic)
          end associate
        end associate
        
+       if(allocated(my_velocities)) deallocate(my_velocities)
+       allocate(my_velocities, mold=dir)
+    
        call do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities)
        
      end associate
diff --git a/src/matcha/do_concurrent_m.f90 b/src/matcha/do_concurrent_m.f90
index 21869c1..d28691c 100644
--- a/src/matcha/do_concurrent_m.f90
+++ b/src/matcha/do_concurrent_m.f90
@@ -1,6 +1,6 @@
 module do_concurrent_m
   use iso_c_binding, only : c_double, c_int
-  use t_cell_collection_m, only : t_cell_collection_t, t_cell_collection_bind_C_t
+  use t_cell_collection_m, only : t_cell_collection_bind_C_t
   implicit none
   private
   public :: do_concurrent_sampled_speeds, do_concurrent_my_velocities, do_concurrent_k, do_concurrent_speeds
@@ -11,34 +11,34 @@ module do_concurrent_m
     pure module subroutine do_concurrent_sampled_speeds(speeds, vel, cumulative_distribution, sampled_speeds) bind(C)
       implicit none
       real(c_double), intent(in) :: speeds(:,:), vel(:), cumulative_distribution(:)
-      real(c_double), intent(out), allocatable :: sampled_speeds(:,:)
+      real(c_double), intent(out) :: sampled_speeds(:,:)
     end subroutine
     
     pure module subroutine do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities) bind(C)
       implicit none
       integer(c_int), intent(in) :: nsteps
       real(c_double), intent(in) :: dir(:,:,:), sampled_speeds(:,:)
-      real(c_double), intent(out), allocatable :: my_velocities(:,:,:)
+      real(c_double), intent(out) :: my_velocities(:,:,:)
     end subroutine
     
     pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
       implicit none
       real(c_double), intent(in) :: speeds(:), vel(:)
-      integer(c_int), intent(out), allocatable :: k(:)
+      integer(c_int), intent(out) :: k(:)
     end subroutine
     
-    pure module subroutine &
-      do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution) bind(C)
+    pure module subroutine do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution) &
+      bind(C)
       implicit none
-      integer(c_int), intent(in) :: nintervals, speed, freq, k(:)
+      integer(c_int), intent(in) :: speed, freq, k(:)
       real(c_double), intent(in) :: emp_distribution(:,:)
-      real(c_double), intent(out), allocatable :: output_distribution(:,:)
+      real(c_double), intent(out) :: output_distribution(:,:)
     end subroutine
     
     module subroutine do_concurrent_speeds(history, speeds) bind(C)
       implicit none
       type(t_cell_collection_bind_C_t), intent(in) :: history(:)
-      real(c_double), intent(out), allocatable :: speeds(:)
+      real(c_double), intent(out) :: speeds(:)
     end subroutine
       
   end interface
diff --git a/src/matcha/do_concurrent_s.f90 b/src/matcha/do_concurrent_s.f90
index 80c37e7..c4798db 100644
--- a/src/matcha/do_concurrent_s.f90
+++ b/src/matcha/do_concurrent_s.f90
@@ -1,15 +1,18 @@
 submodule(do_concurrent_m) do_concurrent_s
+  use assert_m, only : assert
   use iso_c_binding, only : c_f_pointer
   implicit none
 
 contains
   
-  module procedure do_concurrent_sampled_speeds
-  
+  pure module subroutine do_concurrent_sampled_speeds(speeds, vel, cumulative_distribution, sampled_speeds) bind(C)
+    real(c_double), intent(in) :: speeds(:,:), vel(:), cumulative_distribution(:)
+    real(c_double), intent(out) :: sampled_speeds(:,:)
     integer cell, step
+   
+    call assert(all(shape(sampled_speeds)==shape(speeds)), "do_concurrent_sampled_speeds: {sampled_,}speeds shape match")
     
     associate(ncells => size(speeds,1), nsteps => size(speeds,2))
-      allocate(sampled_speeds(ncells,nsteps))
       do concurrent(cell = 1:ncells, step = 1:nsteps)
         associate(k => findloc(speeds(cell,step) >= cumulative_distribution, value=.false., dim=1)-1)
           sampled_speeds(cell,step) = vel(k)
@@ -17,52 +20,53 @@
       end do
     end associate
     
-  end procedure
-  
-  module procedure do_concurrent_my_velocities
-  
+  end subroutine  
+
+  pure module subroutine do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities) bind(C)
+    integer(c_int), intent(in) :: nsteps
+    real(c_double), intent(in) :: dir(:,:,:), sampled_speeds(:,:)
+    real(c_double), intent(out) :: my_velocities(:,:,:)
     integer step
     
-    if(allocated(my_velocities)) deallocate(my_velocities)
-    allocate(my_velocities, mold=dir)
-    
+    call assert(all([size(my_velocities,1),size(sampled_speeds,2)] == shape(sampled_speeds)), &
+      "do_concurrent_my_velocities: argument size match")
+    call assert(all(shape(my_velocities)==shape(dir)), "do_concurrent_my_velocities: argument shape match") ! all three extents
+
     do concurrent(step=1:nsteps)
       my_velocities(:,step,1) = sampled_speeds(:,step)*dir(:,step,1)
       my_velocities(:,step,2) = sampled_speeds(:,step)*dir(:,step,2)
       my_velocities(:,step,3) = sampled_speeds(:,step)*dir(:,step,3)
     end do
-    
-  end procedure
-  
-  module procedure do_concurrent_k
+  end subroutine  
   
-  integer i
+  pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
+    real(c_double), intent(in) :: speeds(:), vel(:)
+    integer(c_int), intent(out) :: k(:)  
+    integer i
   
     associate(nspeeds => size(speeds))
-      if(allocated(k)) deallocate(k)
-      allocate(k(nspeeds))
         do concurrent(i = 1:nspeeds)
           k(i) = findloc(speeds(i) >= vel, value=.false., dim=1)-1
         end do
     end associate
-  end procedure
-  
-  module procedure do_concurrent_output_distribution
+  end subroutine
   
+  pure module subroutine do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution) bind(C)
+    integer(c_int), intent(in) :: speed, freq, k(:)
+    real(c_double), intent(in) :: emp_distribution(:,:)
+    real(c_double), intent(out) :: output_distribution(:,:)
     integer i
     
-    if(allocated(output_distribution)) deallocate(output_distribution)
-    allocate(output_distribution(nintervals,2))
     output_distribution(:,freq) = 0.d0
     output_distribution(:,speed) = emp_distribution(:,speed)
     do concurrent(i = 1:size(output_distribution,1))
       output_distribution(i,freq) = count(k==i)
     end do
-    
-  end procedure
-  
-  module procedure do_concurrent_speeds
+  end subroutine
   
+  module subroutine do_concurrent_speeds(history, speeds) bind(C)
+    type(t_cell_collection_bind_C_t), intent(in) :: history(:)
+    real(c_double), intent(out) :: speeds(:)  
     integer i, j, k
     integer, parameter :: nspacedims=3
     
@@ -78,19 +82,16 @@
          x(i,:,:) = positions
       end do
   
-      associate(t => history%time)
-        allocate(speeds(ncells*(npositions-1)))
-        do concurrent(i = 1:npositions-1, j = 1:ncells)
-          associate( &
-            u => (x(i+1,j,:) - x(i,j,:))/(t(i+1) - t(i)), &
-            ij => i + (j-1)*(npositions-1) &
-           )   
-            speeds(ij) = sqrt(sum([(u(k)**2, k=1,nspacedims)]))
-          end associate
-        end do
-      end associate
+      do concurrent(i = 1:npositions-1, j = 1:ncells)
+        associate( &
+          u => (x(i+1,j,:) - x(i,j,:))/(history(i+1)%time - history(i)%time), &
+          ij => i + (j-1)*(npositions-1) &
+         )   
+          speeds(ij) = sqrt(sum([(u(k)**2, k=1,nspacedims)]))
+        end associate
+      end do
     end associate
     
-  end procedure
+  end subroutine
   
-end submodule do_concurrent_s
+end submodule do_concurrent_s
\ No newline at end of file
diff --git a/src/matcha/output_s.f90 b/src/matcha/output_s.f90
index 1bac989..ec621bf 100644
--- a/src/matcha/output_s.f90
+++ b/src/matcha/output_s.f90
@@ -3,7 +3,7 @@
 submodule(output_m) output_s
   use do_concurrent_m, only : do_concurrent_k, do_concurrent_output_distribution, do_concurrent_speeds
   use t_cell_collection_m, only : t_cell_collection_bind_C_t
-  use iso_c_binding, only : c_loc, c_double
+  use iso_c_binding, only : c_double
   implicit none
   
 contains
@@ -24,16 +24,26 @@
     
     integer, parameter :: speed=1, freq=2 ! subscripts for speeds and frequencies
 
+    associate(npositions => size(self%history_))
+      allocate(speeds(self%my_num_cells()*(npositions-1)))
+    end associate
     call do_concurrent_speeds(t_cell_collection_bind_C_t(self%history_), speeds)
 
-    associate(emp_distribution => self%input_%sample_distribution())
+    block
+      real(c_double), allocatable :: emp_distribution(:,:)
+
+      emp_distribution = self%input_%sample_distribution()
       associate(nintervals => size(emp_distribution(:,1)), dvel_half => (emp_distribution(2,speed)-emp_distribution(1,speed))/2.d0)
         vel = [emp_distribution(1,speed) - dvel_half, [(emp_distribution(i,speed) + dvel_half, i=1,nintervals)]]
+        if (allocated(k)) deallocate(k)
+        allocate(k(size(speeds)))
         call do_concurrent_k(speeds, vel, k)
-        call do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution)
+        if(allocated(output_distribution)) deallocate(output_distribution)
+        allocate(output_distribution(nintervals,2))
+        call do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution)
         output_distribution(:,freq) = output_distribution(:,freq)/sum(output_distribution(:,freq))
       end associate
-    end associate
+    end block
 
   end procedure
 
diff --git a/src/matcha/subdomain_m.f90 b/src/matcha/subdomain_m.f90
index 72c9cbf..f2a22d0 100644
--- a/src/matcha/subdomain_m.f90
+++ b/src/matcha/subdomain_m.f90
@@ -3,29 +3,35 @@ module subdomain_m
 
   private
   public :: subdomain_t
+  public :: march
 
   type subdomain_t 
     private
     real, allocatable :: s_(:,:,:)
   contains
     procedure, pass(self) :: define
-    procedure, pass(self) :: step
-    procedure, pass(rhs) :: multiply
-    generic :: operator(.laplacian.) => laplacian
-    generic :: operator(*) => multiply
-    generic :: operator(+) => add
-    generic :: assignment(=) => assign_and_sync
     procedure dx
     procedure dy
     procedure dz
     procedure values
+    generic :: operator(*) => multiply
+    generic :: operator(+) => add
+    generic :: operator(.laplacian.) => laplacian
+    generic :: assignment(=) => assign_
+    procedure, private, pass(rhs) :: multiply
     procedure, private :: laplacian
     procedure, private :: add
-    procedure, private :: assign_and_sync
+    procedure, private :: assign_
   end type
 
   interface
 
+    pure module function laplacian(rhs) result(laplacian_rhs)
+      implicit none
+      class(subdomain_t), intent(in) :: rhs[*]
+      type(subdomain_t) laplacian_rhs
+    end function
+
     module subroutine define(side, boundary_val, internal_val, n, self)
       implicit none
       real, intent(in) :: side, boundary_val, internal_val
@@ -33,10 +39,10 @@ module subroutine define(side, boundary_val, internal_val, n, self)
       class(subdomain_t), intent(out) :: self
     end subroutine
 
-    module subroutine step(alpha_dt, self)
+    module subroutine march(alpha_dt, self)
       implicit none
       real, intent(in) :: alpha_dt
-      class(subdomain_t), intent(inout) :: self
+      type(subdomain_t), intent(inout) :: self[*]
     end subroutine
 
     pure module function values(self) result(my_values)
@@ -63,12 +69,6 @@ pure module function dz(self) result(my_dz)
       real my_dz
     end function
 
-    pure module function laplacian(rhs) result(laplacian_rhs)
-      implicit none
-      class(subdomain_t), intent(in) :: rhs
-      type(subdomain_t) laplacian_rhs
-    end function
-
     pure module function multiply(lhs, rhs) result(product)
       implicit none
       class(subdomain_t), intent(in) :: rhs
@@ -83,7 +83,7 @@ pure module function add(lhs, rhs) result(total)
       type(subdomain_t) total
     end function
 
-    module subroutine assign_and_sync(lhs, rhs)
+    module subroutine assign_(lhs, rhs)
       implicit none
       class(subdomain_t), intent(out) :: lhs
       type(subdomain_t), intent(in) :: rhs
diff --git a/src/matcha/subdomain_s.f90 b/src/matcha/subdomain_s.f90
index 2ca5eb9..f5bfa0c 100644
--- a/src/matcha/subdomain_s.f90
+++ b/src/matcha/subdomain_s.f90
@@ -1,14 +1,9 @@
 submodule(subdomain_m) subdomain_s
+  use assert_m, only : assert, intrinsic_array_t
   use sourcery_m, only : data_partition_t
-  use assert_m, only : assert
-  use intrinsic_array_m, only : intrinsic_array_t
   implicit none
 
-  real, allocatable :: halo_x(:,:,:)[:]
-  integer, parameter :: west=1, east=2
-
   type(data_partition_t) data_partition
-
   real dx_, dy_, dz_
   integer my_nx, nx, ny, nz, me, num_subdomains, my_internal_west, my_internal_east
   real, allocatable :: increment(:,:,:)
@@ -50,12 +45,6 @@
 
     if (me == 1)              self%s_(1    , :, :) = boundary_val ! minimum x boundary
     if (me == num_subdomains) self%s_(my_nx, :, :) = boundary_val ! maximum x boundary
-
-    if (allocated(halo_x)) deallocate(halo_x)
-    allocate(halo_x(west:east, ny, nz)[*])
-    if (me>1) halo_x(east,:,:)[me-1] = self%s_(1,:,:)
-    if (me<num_subdomains) halo_x(west,:,:)[me+1] = self%s_(my_nx,:,:)
-    sync all
   end procedure
 
   module procedure dx
@@ -70,66 +59,19 @@
     my_dz = dz_
   end procedure
 
-  module procedure laplacian
-
-    integer i, j, k
-    real, allocatable :: halo_west(:,:), halo_east(:,:)
-
-    call assert(allocated(rhs%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
-    call assert(allocated(halo_x), "subdomain_t%laplacian: allocated(halo_x)")
-
-    allocate(laplacian_rhs%s_(my_nx, ny, nz))
-
-    halo_west = merge(halo_x(west,:,:), rhs%s_(1,:,:), me/=1)
-    i = my_internal_west
-    call assert(i+1<=my_nx,"laplacian: westernmost subdomain too small")
-    do concurrent(j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = ( halo_west(j,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    do concurrent(i=my_internal_west+1:my_internal_east-1, j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    halo_east = merge(halo_x(east,:,:), rhs%s_(my_nx,:,:), me/=num_subdomains)
-    i = my_internal_east
-    call assert(i-1>0,"laplacian: easternmost subdomain too small")
-    do concurrent(j=2:ny-1, k=2:nz-1)
-      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) +  halo_east(j  ,k  ))/dx_**2 + &
-                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
-                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
-    end do
-
-    laplacian_rhs%s_(:, 1,:) = 0.
-    laplacian_rhs%s_(:,ny,:) = 0.
-    laplacian_rhs%s_(:,:, 1) = 0.
-    laplacian_rhs%s_(:,:,nz) = 0.
-    if (me==1) laplacian_rhs%s_(1,:,:) = 0.
-    if (me==num_subdomains) laplacian_rhs%s_(my_nx,:,:) = 0.
-
+  module procedure add
+    call assert(allocated(lhs%s_) .and. allocated(rhs%s_), "subdomain_t%add: allocated(rhs%s_)")
+    total%s_ = lhs%s_ +  rhs%s_
   end procedure
 
   module procedure multiply
     call assert(allocated(rhs%s_), "subdomain_t%multiply: allocated(rhs%s_)")
-    product%s_ =  lhs * rhs%s_
+    product%s_ = lhs * rhs%s_ ! bound to operator(*): must multiply, not add
   end procedure
 
-  module procedure add
-    call assert(allocated(rhs%s_), "subdomain_t%add: allocated(rhs%s_)")
-    total%s_ =  lhs%s_ + rhs%s_
-  end procedure
-
-  module procedure assign_and_sync
-    call assert(allocated(rhs%s_), "subdomain_t%assign_and_sync: allocated(rhs%s_)")
-    sync all
+  module procedure assign_
+    call assert(allocated(rhs%s_), "subdomain_t%assign_: allocated(rhs%s_)")
     lhs%s_ =  rhs%s_
-    if (me>1) halo_x(east,:,:)[me-1] = rhs%s_(1,:,:)
-    if (me<num_subdomains) halo_x(west,:,:)[me+1] = rhs%s_(my_nx,:,:)
-    sync all
   end procedure
 
   module procedure values
@@ -137,23 +79,20 @@
     my_values =  self%s_
   end procedure
 
-  module procedure step
+  module procedure march
 
     call assert(allocated(self%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
-    call assert(allocated(halo_x), "subdomain_t%laplacian: allocated(halo_x)")
     call assert(my_internal_west+1<=my_nx,"laplacian: westernmost subdomain too small")
     call assert(my_internal_east-1>0,"laplacian: easternmost subdomain too small")
 
     if (.not. allocated(increment)) allocate(increment(my_nx,ny,nz))
  
+    sync all
     call internal_points(increment)
-    call edge_points(increment)
+    call edge_points(self, increment)
     call apply_boundary_condition(increment)
-
-    sync all
+    sync all ! neighbors read self%s_ in edge_points; wait for them before overwriting it
     self%s_ = self%s_ + increment
-    sync all
-    call exchange_halo(self%s_)
 
   contains
 
@@ -170,14 +108,17 @@ subroutine internal_points(ds)
       end do
     end subroutine
 
-    subroutine edge_points(ds)
+    subroutine edge_points(self, ds)
+      type(subdomain_t), intent(in) :: self[*]
       real, intent(inout) :: ds(:,:,:)
       real, allocatable :: halo_west(:,:), halo_east(:,:)
       integer i, j, k
 
-      halo_west = merge(halo_x(west,:,:), self%s_(1,    :,:), me/=1)
-      halo_east = merge(halo_x(east,:,:), self%s_(my_nx,:,:), me/=num_subdomains)
-
+      if (me==1) then
+        halo_west = self%s_(1,:,:)
+      else
+        halo_west = self[me-1]%s_(ubound(self[me-1]%s_,1),:,:)
+      end if
       i = my_internal_west
       do concurrent(j=2:ny-1,k=2:nz-1)
         ds(i,j,k) = alpha_dt*( &
@@ -187,6 +128,11 @@ subroutine edge_points(ds)
         )
       end do
 
+      if (me==num_subdomains) then ! easternmost image has no east neighbor
+        halo_east = self%s_(my_nx,:,:)
+      else
+        halo_east = self[me+1]%s_(1,:,:)
+      end if
       i = my_internal_east
       do concurrent(j=2:ny-1, k=2:nz-1)
         ds(i,j,k) = alpha_dt*( &
@@ -199,7 +145,6 @@ subroutine edge_points(ds)
 
     subroutine apply_boundary_condition(ds)
       real, intent(inout) :: ds(:,:,:)
-      integer i, j
 
       ds(:,1:ny:ny-1, :       ) = 0.
       ds(:, :       ,1:nz:nz-1) = 0.
@@ -207,12 +152,58 @@ subroutine apply_boundary_condition(ds)
       if (me==num_subdomains) ds(my_nx,:,:) = 0.
     end subroutine
 
-    subroutine exchange_halo(s)
-      real, intent(in) :: s(:,:,:)
-      if (me>1) halo_x(east,:,:)[me-1] = s(1,:,:)
-      if (me<num_subdomains) halo_x(west,:,:)[me+1] = s(my_nx,:,:)
-    end subroutine
-
   end procedure
 
-end submodule subdomain_s
\ No newline at end of file
+  pure module function laplacian(rhs) result(laplacian_rhs)
+    class(subdomain_t), intent(in) :: rhs[*]
+    type(subdomain_t) laplacian_rhs
+
+    integer i, j, k
+    real, allocatable :: halo_west(:,:), halo_east(:,:)
+
+    call assert(allocated(rhs%s_), "subdomain_t%laplacian: allocated(rhs%s_)")
+
+    allocate(laplacian_rhs%s_, mold=rhs%s_)
+
+    if (me==1) then
+      halo_west = rhs%s_(1,:,:)
+    else
+      halo_west = rhs[me-1]%s_(ubound(rhs[me-1]%s_,1),:,:)
+    end if
+    i = my_internal_west
+    call assert(i+1<=my_nx,"laplacian: westernmost subdomain too small")
+    do concurrent(j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = ( halo_west(j,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    do concurrent(i=my_internal_west+1:my_internal_east-1, j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i+1,j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    if (me==num_subdomains) then ! easternmost image has no east neighbor
+      halo_east = rhs%s_(my_nx,:,:)
+    else
+      halo_east = rhs[me+1]%s_(lbound(rhs[me+1]%s_,1),:,:)
+    end if
+    i = my_internal_east
+    call assert(i-1>0,"laplacian: easternmost subdomain too small")
+    do concurrent(j=2:ny-1, k=2:nz-1)
+      laplacian_rhs%s_(i,j,k) = (rhs%s_(i-1,j  ,k  ) - 2*rhs%s_(i,j,k) +  halo_east(j  ,k  ))/dx_**2 + &
+                                (rhs%s_(i  ,j-1,k  ) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j+1,k  ))/dy_**2 + &
+                                (rhs%s_(i  ,j  ,k-1) - 2*rhs%s_(i,j,k) + rhs%s_(i  ,j  ,k+1))/dz_**2
+    end do
+
+    laplacian_rhs%s_(:, 1,:) = 0.
+    laplacian_rhs%s_(:,ny,:) = 0.
+    laplacian_rhs%s_(:,:, 1) = 0.
+    laplacian_rhs%s_(:,:,nz) = 0.
+    if (me==1) laplacian_rhs%s_(1,:,:) = 0.
+    if (me==num_subdomains) laplacian_rhs%s_(my_nx,:,:) = 0.
+
+  end function
+
+end submodule subdomain_s
diff --git a/src/matcha/t_cell_collection_m.f90 b/src/matcha/t_cell_collection_m.f90
index 323d47b..03789ac 100644
--- a/src/matcha/t_cell_collection_m.f90
+++ b/src/matcha/t_cell_collection_m.f90
@@ -42,8 +42,9 @@ pure module function construct(positions, time) result(t_cell_collection)
     
   interface t_cell_collection_bind_C_t
     
-    elemental module function construct_bind_C(t_cell_collection) result(t_cell_collection_bind_C)
+    impure elemental module function construct_bind_C(t_cell_collection) result(t_cell_collection_bind_C)
       !! Result is bind(C) representation of the data inside a t_cell_collection_t object
+      !! This function is impure because it invokes c_loc. Fortran 2023 compliance will allow this function to be pure.
       implicit none
       type(t_cell_collection_t), intent(in), target :: t_cell_collection
       type(t_cell_collection_bind_C_t) t_cell_collection_bind_C
@@ -60,7 +61,6 @@ pure module function positions(self) result(my_positions)
       double precision, allocatable :: my_positions(:,:)
     end function
     
-    
     elemental module function time(self) result(my_time)
       !! Return the t_cell_collection_t object's time stamp
       implicit none
diff --git a/src/matcha/t_cell_collection_s.F90 b/src/matcha/t_cell_collection_s.f90
similarity index 100%
rename from src/matcha/t_cell_collection_s.F90
rename to src/matcha/t_cell_collection_s.f90
diff --git a/src/matcha_s.F90 b/src/matcha_s.F90
index 4735efc..0cc9081 100644
--- a/src/matcha_s.F90
+++ b/src/matcha_s.F90
@@ -32,7 +32,9 @@
         associate(me => this_image())
           associate(my_num_cells => data_partition%last(me) - data_partition%first(me) + 1)
           
+#ifndef NAGFOR
             call random_init(repeatable=.true., image_distinct=.true.)
+#endif
             
             allocate(random_positions(my_num_cells,ndim))
             call random_number(random_positions)  
diff --git a/test/matcha_test_m.f90 b/test/matcha_test_m.f90
index f75e164..20ffe4c 100644
--- a/test/matcha_test_m.f90
+++ b/test/matcha_test_m.f90
@@ -67,7 +67,7 @@ function compare_image_distributions() result(test_passes)
   function compare_global_distributions() result(test_passes)
     logical test_passes
     type(output_t) output
-    double precision, allocatable :: simulated_distribution(:,:)
+    double precision, allocatable :: simulated_distribution(:,:), frequency_distribution(:)
     integer num_cells
     integer, parameter :: speed=1, freq=2 ! subscripts for speeds and frequencies
     real, parameter :: tolerance = 1.D-02
@@ -77,16 +77,16 @@ function compare_global_distributions() result(test_passes)
       associate(empirical_distribution => input%sample_distribution())
         simulated_distribution = output%simulated_distribution()  
         num_cells = output%my_num_cells()
-        simulated_distribution(:,freq) = num_cells*simulated_distribution(:,freq)
-        call co_sum(simulated_distribution(:,freq), result_image=1)
+        frequency_distribution = num_cells*simulated_distribution(:,freq) ! copy to work around nagfor 7.1 Build 7145 compiler bug
+        call co_sum(frequency_distribution, result_image=1)
         call co_sum(num_cells, result_image=1)
         if (this_image()/=1) then
           test_passes = .true.
         else
-          simulated_distribution(:,freq) = simulated_distribution(:,freq)/dble(num_cells)
+          frequency_distribution = frequency_distribution/dble(num_cells)
           associate( &
             diffmax_speeds=> maxval(abs(empirical_distribution(:,speed)-simulated_distribution(:,speed))), &
-            diffmax_freqs => maxval(abs(empirical_distribution(:,freq)-simulated_distribution(:,freq))) &
+            diffmax_freqs => maxval(abs(empirical_distribution(:,freq)-frequency_distribution)) &
           )
             test_passes = (diffmax_freqs < tolerance) .and. (diffmax_speeds < tolerance)
           end associate
diff --git a/test/subdomain_test_m.f90 b/test/subdomain_test_m.f90
index 144ab6e..5b506e3 100644
--- a/test/subdomain_test_m.f90
+++ b/test/subdomain_test_m.f90
@@ -3,7 +3,7 @@
 module subdomain_test_m
   !! Define subdomain tests and procedures required for reporting results
   use sourcery_m, only : test_t, test_result_t
-  use subdomain_m, only : subdomain_t
+  use subdomain_m, only : subdomain_t, march
   use assert_m, only : assert
   implicit none
 
@@ -53,7 +53,8 @@ subroutine output(v)
     critical
       do j = 1, size(v,2)
         do k = 1, size(v,3)
-          print *,"image ",this_image(),": ",j,k,v(:,j,k)
+          !print *,"image ",this_image(),": ",j,k,v(:,j,k)
+          print *,j,k,v(:,j,k)
         end do
       end do
     end critical
@@ -62,10 +63,12 @@ subroutine output(v)
 
   function concave_laplacian() result(test_passes)
     logical test_passes
-    type(subdomain_t) f, laplacian_f
+    type(subdomain_t), save :: f[*]
+    type(subdomain_t) :: laplacian_f
     real, allocatable :: lap_f_vals(:,:,:)
 
-    call f%define(side=1., boundary_val=1., internal_val=2., n=21) ! internally constant subdomain with a step down at all surfaces
+    call f%define(side=1., boundary_val=1., internal_val=2., n=32) ! internally constant subdomain with a step down at all surfaces
+    sync all
     laplacian_f = .laplacian. f
     lap_f_vals = laplacian_f%values()
 
@@ -153,15 +156,16 @@ function concave_laplacian() result(test_passes)
 
   function correct_steady_state() result(test_passes)
     logical test_passes
-    type(subdomain_t) T
-    real, parameter :: T_boundary = 1., T_initial = 2., tolerance = 0.01, T_steady = T_boundary, alpha = 1.
-    integer, parameter :: steps = 6000
+    type(subdomain_t), save :: T[*]
+    real, parameter :: T_boundary = 1., T_initial = 2., tolerance = 5.E-03, T_steady = T_boundary, alpha = 1.
+    integer, parameter :: steps = 25000
     integer step
 
-    call T%define(side=1., boundary_val=T_boundary, internal_val=T_initial, n=21) ! const. internally with a step down at boundaries
+    call T%define(side=1., boundary_val=T_boundary, internal_val=T_initial, n=32) ! const. internally with a step down at boundaries
 
     associate(dt => T%dx()*T%dy()*T%dz()/(4*alpha))
       do step = 1, steps
+        sync all
         T =  T + dt * alpha * .laplacian. T
       end do
     end associate
@@ -174,7 +178,7 @@ function correct_steady_state() result(test_passes)
   function functional_matches_procedural() result(test_passes)
     logical test_passes
     real, parameter :: tolerance = 1.E-06
-    integer, parameter :: steps = 6000, n=21
+    integer, parameter :: steps = 6000, n=32
     real, parameter :: alpha = 1.
     real, parameter :: side=1., boundary_val=1., internal_val=2.
 
@@ -188,13 +192,14 @@ function functional_matches_procedural() result(test_passes)
 
     function T_functional()
       real, allocatable :: T_functional(:,:,:)
-      type(subdomain_t) T
+      type(subdomain_t), save :: T[*]
       integer step
 
       call T%define(side, boundary_val, internal_val, n)
 
       associate(dt => T%dx()*T%dy()/(4*alpha))
         do step = 1, steps
+          sync all
           T =  T + dt * alpha * .laplacian. T
         end do
       end associate
@@ -204,14 +209,15 @@ function T_functional()
 
     function T_procedural()
       real, allocatable :: T_procedural(:,:,:)
-      type(subdomain_t) T
+      type(subdomain_t), save :: T[*]
       integer step
 
       call T%define(side, boundary_val, internal_val, n)
 
       associate(dt => T%dx()*T%dy()/(4*alpha))
         do step = 1, steps
-          call T%step(alpha*dt)
+          sync all
+          call march(alpha*dt, T)
         end do
       end associate