diff --git a/benchmark/core/Cabana_BinSortPerformance.cpp b/benchmark/core/Cabana_BinSortPerformance.cpp
index 796a74e2d..b508f0728 100644
--- a/benchmark/core/Cabana_BinSortPerformance.cpp
+++ b/benchmark/core/Cabana_BinSortPerformance.cpp
@@ -203,10 +203,12 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 1000, 10000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<int> num_bins = { 10, 100, 1000, 10000 };
     if ( run_type == "large" )
     {
         problem_sizes = { 1000, 10000, 100000, 1000000, 10000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
         num_bins = { 10, 100, 1000, 10000, 100000, 1000000, 10000000 };
     }
@@ -227,7 +229,8 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       num_bins );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes, num_bins );
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
+                                       num_bins );
 
     // Close the output file on rank 0.
     file.close();
diff --git a/benchmark/core/Cabana_LinkedCellPerformance.cpp b/benchmark/core/Cabana_LinkedCellPerformance.cpp
index 4d8db2dfa..f513c5baa 100644
--- a/benchmark/core/Cabana_LinkedCellPerformance.cpp
+++ b/benchmark/core/Cabana_LinkedCellPerformance.cpp
@@ -190,10 +190,13 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 100, 1000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<double> cutoff_ratios = { 3.0, 4.0 };
     if ( run_type == "large" )
-        problem_sizes = { 1000, 10000, 100000, 1000000 };
-
+    {
+        problem_sizes = { 1000, 10000, 100000, 1000000, 10000000, 100000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
+    }
     // Open the output file on rank 0.
     std::fstream file;
     file.open( filename, std::fstream::out );
@@ -211,7 +214,7 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       cutoff_ratios );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes,
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
                                        cutoff_ratios );
 
     // Close the output file on rank 0.
diff --git a/benchmark/core/Cabana_NeighborArborXPerformance.cpp b/benchmark/core/Cabana_NeighborArborXPerformance.cpp
index e7647c417..b22201d6c 100644
--- a/benchmark/core/Cabana_NeighborArborXPerformance.cpp
+++ b/benchmark/core/Cabana_NeighborArborXPerformance.cpp
@@ -127,11 +127,49 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
             // Create the neighbor list.
             double cutoff = cutoff_ratios[c];
             create_timer.start( pid );
+
+            // Note: this needs to match the neighbor function used below
+            // (only necessary for the neighbor statistics printing).
+            using neigh_type =
+                Cabana::Experimental::Dense<memory_space, ListTag>;
+
             auto const nlist = Cabana::Experimental::make2DNeighborList(
                 ListTag{}, Cabana::slice<0>( aosoas[p], "position" ), 0, num_p,
                 cutoff );
             create_timer.stop( pid );
 
+            // Print neighbor statistics once per system.
+            if ( t == 0 )
+            {
+                std::size_t max_neigh;
+                Kokkos::Max<std::size_t> max_reducer( max_neigh );
+                std::size_t min_neigh;
+                Kokkos::Min<std::size_t> min_reducer( min_neigh );
+                std::size_t total_neigh;
+                Kokkos::Sum<std::size_t> total_reducer( total_neigh );
+                Kokkos::parallel_reduce(
+                    "Cabana::Benchmark::countNeighbors", policy,
+                    KOKKOS_LAMBDA( const int p, std::size_t& min,
+                                   std::size_t& max, std::size_t& sum ) {
+                        auto const val =
+                            Cabana::NeighborList<neigh_type>::numNeighbor(
+                                nlist, p );
+                        if ( val < min )
+                            min = val;
+                        if ( val > max )
+                            max = val;
+                        sum += val;
+                    },
+                    min_reducer, max_reducer, total_reducer );
+                Kokkos::fence();
+                std::cout << "List min neighbors: " << min_neigh
+                          << std::endl;
+                std::cout << "List max neighbors: " << max_neigh
+                          << std::endl;
+                std::cout << "List avg neighbors: " << total_neigh / num_p
+                          << std::endl;
+                std::cout << std::endl;
+            }
 
             // Iterate through the neighbor list.
             iteration_timer.start( pid );
             Cabana::neighbor_parallel_for( policy, count_op, nlist,
@@ -175,10 +213,12 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 100, 1000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<double> cutoff_ratios = { 2.0, 3.0 };
     if ( run_type == "large" )
     {
-        problem_sizes = { 1000, 10000, 100000, 1000000 };
+        problem_sizes = { 1000, 10000, 100000, 1000000, 10000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
         cutoff_ratios = { 3.0, 4.0, 5.0 };
     }
@@ -199,7 +239,7 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       cutoff_ratios );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes,
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
                                        cutoff_ratios );
 
     // Close the output file on rank 0.
diff --git a/benchmark/core/Cabana_NeighborVerletPerformance.cpp b/benchmark/core/Cabana_NeighborVerletPerformance.cpp
index 907fb3b73..310fd0ddb 100644
--- a/benchmark/core/Cabana_NeighborVerletPerformance.cpp
+++ b/benchmark/core/Cabana_NeighborVerletPerformance.cpp
@@ -39,6 +39,8 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     using LayoutTag = Cabana::VerletLayout2D;
     using BuildTag = Cabana::TeamVectorOpTag;
     using IterTag = Cabana::SerialOpTag;
+    using neigh_type =
+        Cabana::VerletList<memory_space, Cabana::FullNeighborTag, LayoutTag, BuildTag>;
 
     // Declare problem sizes.
     int num_problem_size = problem_sizes.size();
@@ -139,11 +141,9 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
                 // Create the neighbor list.
                 double cutoff = cutoff_ratios[c0];
                 create_timer.start( pid );
-                Cabana::VerletList<memory_space, Cabana::FullNeighborTag,
-                                   LayoutTag, BuildTag>
-                    nlist( Cabana::slice<0>( aosoas[p], "position" ), 0,
-                           num_p, cutoff, cell_ratios[c1], grid_min,
-                           grid_max );
+                neigh_type nlist( Cabana::slice<0>( aosoas[p], "position" ),
+                                  0, num_p, cutoff, cell_ratios[c1],
+                                  grid_min, grid_max );
                 create_timer.stop( pid );
 
                 // Iterate through the neighbor list.
@@ -157,39 +157,31 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
                 // Print neighbor statistics once per system.
                 if ( t == 0 )
                 {
-                    Kokkos::MinMaxScalar<int> min_max;
-                    Kokkos::MinMax<int> reducer( min_max );
-                    auto const& nlist_data_count =
-                        nlist._data.counts; // capture just the view
+                    std::size_t max_neigh;
+                    Kokkos::Max<std::size_t> max_reducer( max_neigh );
+                    std::size_t min_neigh;
+                    Kokkos::Min<std::size_t> min_reducer( min_neigh );
+                    std::size_t total_neigh;
+                    Kokkos::Sum<std::size_t> total_reducer( total_neigh );
                     Kokkos::parallel_reduce(
-                        "Cabana::countMinMax", policy,
-                        KOKKOS_LAMBDA(
-                            const int p,
-                            Kokkos::MinMaxScalar<int>& local_minmax ) {
-                            auto const val = nlist_data_count( p );
-                            if ( val < local_minmax.min_val )
-                            {
-                                local_minmax.min_val = val;
-                            }
-                            if ( val > local_minmax.max_val )
-                            {
-                                local_minmax.max_val = val;
-                            }
+                        "Cabana::Benchmark::countNeighbors", policy,
+                        KOKKOS_LAMBDA( const int p, std::size_t& min,
+                                       std::size_t& max,
+                                       std::size_t& sum ) {
+                            auto const val = Cabana::NeighborList<
+                                neigh_type>::numNeighbor( nlist, p );
+                            if ( val < min )
+                                min = val;
+                            if ( val > max )
+                                max = val;
+                            sum += val;
                         },
-                        reducer );
+                        min_reducer, max_reducer, total_reducer );
                     Kokkos::fence();
-                    std::cout << "List min neighbors: " << min_max.min_val
+                    std::cout << "List min neighbors: " << min_neigh
                               << std::endl;
-                    std::cout << "List max neighbors: " << min_max.max_val
+                    std::cout << "List max neighbors: " << max_neigh
                               << std::endl;
-                    int total_neigh = 0;
-                    Kokkos::parallel_reduce(
-                        "Cabana::countSum", policy,
-                        KOKKOS_LAMBDA( const int p, int& nsum ) {
-                            nsum += nlist._data.counts( p );
-                        },
-                        total_neigh );
-                    Kokkos::fence();
                     std::cout << "List avg neighbors: " << total_neigh / num_p
                               << std::endl;
@@ -232,11 +224,13 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 100, 1000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<double> cutoff_ratios = { 2.0, 3.0 };
     std::vector<double> cell_ratios = { 1.0 };
     if ( run_type == "large" )
    {
-        problem_sizes = { 1000, 10000, 100000, 1000000 };
+        problem_sizes = { 1000, 10000, 100000, 1000000, 10000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
         cutoff_ratios = { 3.0, 4.0, 5.0 };
         cell_ratios = { 1.0 };
     }
@@ -258,7 +252,7 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       cutoff_ratios, cell_ratios );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes,
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
                                        cutoff_ratios, cell_ratios );
 
     // Close the output file on rank 0.
diff --git a/benchmark/grid/Cabana_Grid_HaloPerformance.cpp b/benchmark/grid/Cabana_Grid_HaloPerformance.cpp
index 31a922159..fb8a862ee 100644
--- a/benchmark/grid/Cabana_Grid_HaloPerformance.cpp
+++ b/benchmark/grid/Cabana_Grid_HaloPerformance.cpp
@@ -214,13 +214,13 @@ int main( int argc, char* argv[] )
     // Don't run twice on the CPU if only host enabled.
     if ( !std::is_same<device_type, host_device_type>{} )
     {
-        performanceTest<device_type>( file, partitioner,
-                                      grid_sizes_per_dim_per_rank, "device_",
-                                      halo_widths, MPI_COMM_WORLD );
+        performanceTest<device_type>(
+            file, partitioner, grid_sizes_per_dim_per_rank, "device_device_",
+            halo_widths, MPI_COMM_WORLD );
     }
-    performanceTest<host_device_type>( file, partitioner,
-                                       grid_sizes_per_dim_per_rank, "host_",
-                                       halo_widths, MPI_COMM_WORLD );
+    performanceTest<host_device_type>(
+        file, partitioner, grid_sizes_per_dim_per_rank, "host_host_",
+        halo_widths, MPI_COMM_WORLD );
 
     // Finalize
     Kokkos::finalize();
diff --git a/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp b/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp
index 59c19d20d..b649123dd 100644
--- a/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp
+++ b/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp
@@ -177,8 +177,9 @@ void g2p2g( const ScalarValueP2GType& scalar_p2g, const Coordinates& points,
 // Performance test.
 template <class Device>
 void performanceTest( std::ostream& stream, const std::string& test_prefix,
-                      std::vector<int> cells_per_dim,
-                      std::vector<int> particles_per_cell )
+                      std::vector<int> cells_per_dim_per_rank,
+                      std::vector<int> particles_per_cell,
+                      const DimBlockPartitioner<3> partitioner, MPI_Comm comm )
 {
     using exec_space = typename Device::execution_space;
     using memory_space = typename Device::memory_space;
@@ -192,7 +193,7 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     std::array<bool, 3> is_dim_periodic = { false, false, false };
 
     // System sizes
-    int num_problem_size = cells_per_dim.size();
+    int num_problem_size = cells_per_dim_per_rank.size();
     int num_particles_per_cell = particles_per_cell.size();
 
     // Define the particle types.
@@ -201,7 +202,6 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     using aosoa_type = Cabana::AoSoA<member_types, memory_space>;
 
     // Define properties that do not depend on mesh size.
-    DimBlockPartitioner<3> partitioner;
     int halo_width = 1;
 
     uint64_t seed = 1938347;
@@ -298,9 +298,15 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     for ( int n = 0; n < num_problem_size; ++n )
     {
         // Create the global grid
-        double cell_size = 1.0 / cells_per_dim[n];
+        auto ranks_per_dim =
+            partitioner.ranksPerDimension( comm, { 0, 0, 0 } );
+        std::array<int, 3> cells_per_dim;
+        for ( int d = 0; d < 3; ++d )
+        {
+            cells_per_dim[d] = cells_per_dim_per_rank[n] * ranks_per_dim[d];
+        }
         auto global_mesh = createUniformGlobalMesh(
-            global_low_corner, global_high_corner, cell_size );
+            global_low_corner, global_high_corner, cells_per_dim );
         auto global_grid = createGlobalGrid( MPI_COMM_WORLD, global_mesh,
                                              is_dim_periodic, partitioner );
@@ -500,32 +506,32 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     }
 
     // Output results
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_scalar_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_vector_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_scalar_gradient_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_vector_divergence_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_tensor_divergence_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_scalar_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_vector_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_scalar_gradient_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_vector_gradient_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_vector_divergence_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_fused_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_fused_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p2g_fused_timer );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_scalar_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_vector_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_scalar_gradient_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_vector_divergence_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_tensor_divergence_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_scalar_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_vector_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_scalar_gradient_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_vector_gradient_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_vector_divergence_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_fused_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_fused_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p2g_fused_timer, comm );
 
     stream << std::flush;
 }
@@ -555,16 +561,50 @@ int main( int argc, char* argv[] )
     std::string run_type = "";
     if ( argc > 2 )
         run_type = argv[2];
-    std::vector<int> cells_per_dim = { 16, 32 };
+    std::vector<int> cells_per_dim_per_rank = { 16, 32 };
     std::vector<int> particles_per_cell = { 1, 4 };
     if ( run_type == "large" )
     {
-        cells_per_dim = { 16, 32, 64, 128 };
+        cells_per_dim_per_rank = { 16, 32, 64, 128 };
         particles_per_cell = { 1, 8, 16 };
     }
+
+    // Barrier before continuing.
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    // Get comm rank and size.
+    int comm_rank;
+    MPI_Comm_rank( MPI_COMM_WORLD, &comm_rank );
+    int comm_size;
+    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+    // Get partitioner.
+    DimBlockPartitioner<3> partitioner;
+    // Get ranks per dimension.
+    std::array<int, 3> ranks_per_dimension =
+        partitioner.ranksPerDimension( MPI_COMM_WORLD, { 0, 0, 0 } );
+
     // Open the output file on rank 0.
     std::fstream file;
-    file.open( filename, std::fstream::out );
+    // Output problem details.
+    if ( 0 == comm_rank )
+    {
+        file.open( filename + "_" + std::to_string( comm_size ),
+                   std::fstream::out );
+        file << "\n";
+        file << "Cabana::Grid Interpolation Performance Benchmark"
+             << "\n";
+        file << "----------------------------------------------"
+             << "\n";
+        file << "MPI Ranks: " << comm_size << "\n";
+        file << "MPI Cartesian Dim Ranks: (" << ranks_per_dimension[0] << ", "
+             << ranks_per_dimension[1] << ", " << ranks_per_dimension[2]
+             << ")\n";
+        file << "----------------------------------------------"
+             << "\n";
+        file << "\n";
+        file << std::flush;
+    }
 
     // Do everything on the default CPU.
     using host_exec_space = Kokkos::DefaultHostExecutionSpace;
@@ -576,11 +616,13 @@ int main( int argc, char* argv[] )
     // Don't run twice on the CPU if only host enabled.
     if ( !std::is_same<device_type, host_device_type>{} )
     {
-        performanceTest<device_type>( file, "device_", cells_per_dim,
-                                      particles_per_cell );
+        performanceTest<device_type>( file, "device_", cells_per_dim_per_rank,
+                                      particles_per_cell, partitioner,
+                                      MPI_COMM_WORLD );
     }
-    performanceTest<host_device_type>( file, "host_", cells_per_dim,
-                                       particles_per_cell );
+    performanceTest<host_device_type>( file, "host_", cells_per_dim_per_rank,
+                                       particles_per_cell, partitioner,
+                                       MPI_COMM_WORLD );
 
     // Close the output file on rank 0.
     file.close();
diff --git a/benchmark/plot/Cabana_BenchmarkPlotUtils.py b/benchmark/plot/Cabana_BenchmarkPlotUtils.py
index faa666925..86478ca0c 100644
--- a/benchmark/plot/Cabana_BenchmarkPlotUtils.py
+++ b/benchmark/plot/Cabana_BenchmarkPlotUtils.py
@@ -19,14 +19,25 @@ class DataDescription:
     def __init__(self, label):
         #Example: serial_neigh_iteration_3_1
+        #         host_host_halo_gather_0
         self.label = label
         details = label.split("_")
-        self.backend = details[0].strip()
-        self.type = details[1].strip()
-        self.category = details[2].strip()
+        index = 1
+        # MPI has two backends
+        if "host_host" in label or "device_device" in label or "host_device" in label:
+            self.backend = "_".join(details[0:2]).strip()
+            index += 1
+        else:
+            self.backend = details[0].strip()
+        self.type = details[index].strip()
+
+        index += 1
+        self.category = details[index].strip()
         if self.category == "iteration":
             self.category = "iterate"
+
+        index += 1
         self.params = []
-        for p in details[3:]:
+        for p in details[index:]:
             self.params.append(p.strip())
 
 # Header description for one series of runs with MPI.
@@ -38,22 +49,10 @@ def __init__(self, label):
         details = label.split("_")
         self.backend = "_".join(details[0:2]).strip()
         self.type = details[2].strip()
+        # This is the only difference that still requires this separate class.
         self.category = details[-1].strip()
         self.params = details[3:-1]
 
-# Header description for one series of runs for p2g/g2p.
-class DataDescriptionInterpolation(DataDescription):
-    # Purposely not calling base __init__
-    def __init__(self, label):
-        #Example: device_p2g_scalar_value_16
-        self.label = label
-        details = label.split("_")
-        self.backend = details[0].strip()
-        self.type = "interpolation"
-        self.category = details[1].strip()
-        self.params = ["_".join(details[2:4]).strip()]
-        self.size = details[-1]
-
 # Create a header description to compare results against.
 class ManualDataDescription:
     def __init__(self, backend, type, category, params):
@@ -112,22 +111,10 @@ def __init__(self, description, line):
         self.size = int(float(results[1]))
         self._initTimeResults(results[2:])
 
-# Single p2g/g2p result (single line in results file).
-class DataPointInterpolation(DataPoint):
-    # Purposely not calling base __init__
-    def __init__(self, description, line):
-        # Deep copy necessary because unique parameters are used per result (ppc)
-        self.description = deepcopy(description)
-
-        #ppc min max ave
-        self.line = line
-        results = line.split()
-        self.size = int(float(description.size))
-        self._initTimeResults(results[1:])
-        self.description.params.append(results[0])
-
 # All performance results from multiple files.
 class AllData:
+    mpi = False
+
     def __init__(self, filelist, grid=False):
         self.grid = grid
         self.results = []
@@ -213,13 +200,15 @@ def getAllCategories(self):
 
 # All MPI performance results from multiple files.
 class AllDataMPI(AllData):
+    mpi = True
+
     def _endOfFile(self, l):
         return l >= self.total
 
     def _readFile(self, filename):
         with open(filename) as f:
             txt = f.readlines()
-        size = int(txt[4].split()[-1])
+        size = int(txt[5].split()[-1])
 
         l = 8
         self.total = len(txt[l:])
@@ -237,8 +226,10 @@ def _readFile(self, filename):
         else:
             l += 1
 
-# All MPI performance results from multiple files.
+# All Grid performance results from multiple files.
 class AllDataGrid(AllData):
+    mpi = True
+
     def _endOfFile(self, l):
         return l > self.total
@@ -262,16 +254,7 @@ def _readFile(self, filename):
             else:
                 l += 1
 
-# All p2g/g2p performance results from multiple files.
-class AllDataInterpolation(AllData):
-    def _getDescription(self, line):
-        return DataDescriptionInterpolation(line)
-
-    def _getData(self, descr, line):
-        return DataPointInterpolation(descr, line)
-
 # All performance results for a single set of parameters.
-# FIXME: this may need to be sorted for plotting.
 class AllSizesSingleResult:
     def __init__(self, all_data: AllData, descr: ManualDataDescription):
         self.times = np.array([])
@@ -281,6 +264,11 @@ def __init__(self, all_data: AllData, descr: ManualDataDescription):
             self.sizes = np.append(self.sizes, d.size)
             self.times = np.append(self.times, d.ave)
 
+    def sort(self):
+        indices = np.argsort(self.sizes)
+        self.sizes = self.sizes[indices]
+        self.times = self.times[indices]
+
     def _compareAll(self, data_description, check):
         if data_description.backend == check.backend and data_description.category == check.category and data_description.type == check.type and data_description.params == check.params:
             return True
@@ -294,22 +282,22 @@ def getData(filelist):
     if "Cabana Comm" in txt:
         return AllDataMPI(filelist)
     # FIXME: Cajita backwards compatibility
-    elif ("Cajita Halo" in txt or "Cajita FFT" in txt or
-          "Cabana::Grid Halo" in txt or "Cabana::Grid FFT" in txt):
+    elif ("Cajita" in txt or "Cabana::Grid " in txt):
         return AllDataGrid(filelist, grid=True)
-    elif "g2p" in txt:
-        return AllDataInterpolation(filelist, grid=True)
     return AllData(filelist)
 
-# Return separate data for the first two files only.
-def getSeparateData(filelist):
+# Return separate data for the first and last N files.
+def getSeparateData(filelist, split_index = -1):
     if len(filelist) < 2:
         exit("Separate data requires at least two files.")
-    split_index = 1
-    for f, fname in enumerate(filelist):
-        if fname.split("_")[:-1] != filelist[0].split("_")[:-1]:
-            split_index = f
+    if len(filelist) % 2 != 0:
+        exit("Cannot compare odd number of files.")
+    # By default (without manual user input) search for a change in file names.
+    if split_index == -1:
+        for f, fname in enumerate(filelist):
+            if fname.split("_")[:-1] != filelist[0].split("_")[:-1]:
+                split_index = f
 
     data1 = getData(filelist[:split_index])
     data2 = getData(filelist[split_index:])
     return data1, data2
@@ -339,12 +327,11 @@ def getLegend(data: AllData, cpu_name, gpu_name, backend_label):
     if backend_label:
         backends = data.getAllBackends()
         for backend in data.getAllBackends():
-            # FIXME: backwards compatibility
             if "_host" in backend and "host_host" not in backend:
                 legend.append(Line2D([0], [0], color="k", lw=2, linestyle= "-.", label=cpu_name+" CPU"))
-            elif "host" in backend or "serial" in backend or "openmp" in backend:
+            elif "host" in backend:
                 legend.append(Line2D([0], [0], color="k", lw=2, linestyle= "--", label=cpu_name+" CPU"))
-            elif "device" in backend or "cuda" in backend or "hip" in backend:
+            elif "device" in backend:
                 legend.append(Line2D([0], [0], color="k", lw=2, linestyle="-", label=gpu_name+" GPU"))
 
     colors = getColors(data)
@@ -357,16 +344,15 @@
 def plotResults(ax, x, y, backend, color):
     linewidth = 2
     dash = "-"
-    # FIXME: backwards compatibility
     if "_host" in backend and "host_host" not in backend:
         dash = "-."
-    elif "host" in backend or "serial" in backend or "openmp" in backend:
+    elif "host" in backend:
         dash = "--"
 
     ax.plot(x, y, color=color, lw=linewidth, marker='o', linestyle=dash)
 
 # Add plot labels and show/save.
-def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_name="", gpu_name=""):
+def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_name="", gpu_name="", filename="Cabana_Benchmark.png", dpi=0):
     if speedup:
         min_max = data.minMaxSize()
         if data.grid: min_max = min_max**3
@@ -378,8 +364,11 @@ def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_na
         ax.set_ylabel("Speedup")
     else:
         ax.set_ylabel("Time (seconds)")
-    if data.grid:
-        ax.set_xlabel("Number of grid points")
+
+    if data.grid: # Always uses MPI
+        ax.set_xlabel("Number of grid points per rank")
+    elif data.mpi:
+        ax.set_xlabel("Number of particles per rank")
     else:
         ax.set_xlabel("Number of particles")
@@ -389,5 +378,8 @@ def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_na
     ax.set_yscale('log')
 
     fig.tight_layout()
-    plt.show()
-    #plt.savefig("Cabana_Benchmark.png", dpi=300)
+
+    if dpi:
+        plt.savefig(filename, dpi=dpi)
+    else:
+        plt.show()
diff --git a/benchmark/plot/Cabana_PlotBenchmark.py b/benchmark/plot/Cabana_PlotBenchmark.py
index b7473a34a..5e50aefe3 100644
--- a/benchmark/plot/Cabana_PlotBenchmark.py
+++ b/benchmark/plot/Cabana_PlotBenchmark.py
@@ -15,7 +15,7 @@ from Cabana_BenchmarkPlotUtils import *
 
 # Plot all results in a list of files.
-def plotAll(ax, data):
+def plotAll(ax, data, sort=False):
     color_dict = getColors(data)
     for backend in data.getAllBackends():
         for cat in data.getAllCategories():
@@ -23,6 +23,8 @@ def plotAll(ax, data):
             for param in data.getAllParams():
                 desc = ManualDataDescription(backend, type, cat, param)
                 result = AllSizesSingleResult(data, desc)
+                if sort:
+                    result.sort()
                 sizes = scaleSizes(result.sizes, data.grid)
                 plotResults(ax, sizes, result.times, backend, color_dict[cat])
@@ -45,8 +47,30 @@ def plotCompareHostDevice(ax, data, compare="host"):
                 num_2 = len(result2.times)
                 max = num_1 if num_1 < num_2 else num_2
 
-                sizes = scaleSizes(result.sizes, data.grid)
-                speedup = result2.times / result.times
+                sizes = scaleSizes(result.sizes[:max], data.grid)
+                speedup = result2.times[:max] / result.times[:max]
                 plotResults(ax, sizes, speedup, backend, color_dict[cat])
     return True
 
+# Compare results with and without an extra parameter (e.g. buffer) from a list of files.
+def plotCompare(ax, data, compare=["buffer"]):
+    color_dict = getColors(data)
+    for backend in data.getAllBackends():
+        if "host" in backend: continue
+        for cat in data.getAllCategories():
+            for type in data.getAllTypes():
+                for param in data.getAllParams():
+                    desc = ManualDataDescription(backend, type, cat, compare+param)
+                    result = AllSizesSingleResult(data, desc)
+                    desc2 = ManualDataDescription(backend, type, cat, param)
+                    result2 = AllSizesSingleResult(data, desc2)
+
+                    num_1 = len(result.times)
+                    num_2 = len(result2.times)
+                    max = num_1 if num_1 < num_2 else num_2
+
+                    sizes = scaleSizes(result.sizes[:max], data.grid)
+                    speedup = result2.times[:max] / result.times[:max]
+                    plotResults(ax, sizes, speedup, backend, color_dict[cat])
+    return True
+
@@ -91,11 +115,12 @@ def plotCompareFiles(ax, data1, data2, ignore_backend=[]):
     data = getData(filelist)
 
     speedup = plotAll(ax1, data)
-    #speedup = plotCompareHostDevice(ax1, data, "serial")
+    #speedup = plotCompareHostDevice(ax1, data)
+    #speedup = plotCompare(ax1, data)
 
     #data, data_f2 = getSeparateData(filelist)
-    #speedup = plotCompareFiles(ax1, data, data_f2, ["cuda_host", "cudauvm_cudauvm", "hip_host"])
+    #speedup = plotCompareFiles(ax1, data, data_f2)
     ###
 
     createPlot(fig1, ax1, data,
-               speedup=speedup, backend_label=True)#, cpu_name="POWER9", gpu_name="V100")
+               speedup=speedup, backend_label=True)#, cpu_name="EPYC", gpu_name="MI250X")