diff --git a/benchmark/core/Cabana_BinSortPerformance.cpp b/benchmark/core/Cabana_BinSortPerformance.cpp
index 796a74e2d..b508f0728 100644
--- a/benchmark/core/Cabana_BinSortPerformance.cpp
+++ b/benchmark/core/Cabana_BinSortPerformance.cpp
@@ -203,10 +203,12 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 1000, 10000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<int> num_bins = { 10, 100, 1000, 10000 };
     if ( run_type == "large" )
     {
         problem_sizes = { 1000, 10000, 100000, 1000000, 10000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
         num_bins = { 10, 100, 1000, 10000, 100000, 1000000, 10000000 };
     }
@@ -227,7 +229,8 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       num_bins );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes, num_bins );
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
+                                       num_bins );
 
     // Close the output file on rank 0.
     file.close();
diff --git a/benchmark/core/Cabana_LinkedCellPerformance.cpp b/benchmark/core/Cabana_LinkedCellPerformance.cpp
index 4d8db2dfa..f513c5baa 100644
--- a/benchmark/core/Cabana_LinkedCellPerformance.cpp
+++ b/benchmark/core/Cabana_LinkedCellPerformance.cpp
@@ -190,10 +190,13 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 100, 1000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<double> cutoff_ratios = { 3.0, 4.0 };
     if ( run_type == "large" )
-        problem_sizes = { 1000, 10000, 100000, 1000000 };
-
+    {
+        problem_sizes = { 1000, 10000, 100000, 1000000, 10000000, 100000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
+    }
     // Open the output file on rank 0.
     std::fstream file;
     file.open( filename, std::fstream::out );
@@ -211,7 +214,7 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       cutoff_ratios );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes,
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
                                        cutoff_ratios );
 
     // Close the output file on rank 0.
diff --git a/benchmark/core/Cabana_NeighborArborXPerformance.cpp b/benchmark/core/Cabana_NeighborArborXPerformance.cpp
index e7647c417..b22201d6c 100644
--- a/benchmark/core/Cabana_NeighborArborXPerformance.cpp
+++ b/benchmark/core/Cabana_NeighborArborXPerformance.cpp
@@ -127,11 +127,49 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
             // Create the neighbor list.
             double cutoff = cutoff_ratios[c];
             create_timer.start( pid );
+
+            // Note: this needs to match the neighbor function used below
+            // (only necessary for the neighbor statistics printing).
+            using neigh_type =
+                Cabana::Experimental::Dense<memory_space, ListTag>;
+
             auto const nlist = Cabana::Experimental::make2DNeighborList(
                 ListTag{}, Cabana::slice<0>( aosoas[p], "position" ), 0, num_p,
                 cutoff );
             create_timer.stop( pid );
 
+            // Print neighbor statistics once per system.
+            if ( t == 0 )
+            {
+                std::size_t max_neigh;
+                Kokkos::Max<std::size_t> max_reducer( max_neigh );
+                std::size_t min_neigh;
+                Kokkos::Min<std::size_t> min_reducer( min_neigh );
+                std::size_t total_neigh;
+                Kokkos::Sum<std::size_t> total_reducer( total_neigh );
+                Kokkos::parallel_reduce(
+                    "Cabana::Benchmark::countNeighbors", policy,
+                    KOKKOS_LAMBDA( const int p, std::size_t& min,
+                                   std::size_t& max, std::size_t& sum ) {
+                        auto const val =
+                            Cabana::NeighborList<neigh_type>::numNeighbor(
+                                nlist, p );
+                        if ( val < min )
+                            min = val;
+                        if ( val > max )
+                            max = val;
+                        sum += val;
+                    },
+                    min_reducer, max_reducer, total_reducer );
+                Kokkos::fence();
+                std::cout << "List min neighbors: " << min_neigh
+                          << std::endl;
+                std::cout << "List max neighbors: " << max_neigh
+                          << std::endl;
+                std::cout << "List avg neighbors: " << total_neigh / num_p
+                          << std::endl;
+                std::cout << std::endl;
+            }
 
             // Iterate through the neighbor list.
             iteration_timer.start( pid );
             Cabana::neighbor_parallel_for( policy, count_op, nlist,
@@ -175,10 +213,12 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 100, 1000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<double> cutoff_ratios = { 2.0, 3.0 };
     if ( run_type == "large" )
     {
-        problem_sizes = { 1000, 10000, 100000, 1000000 };
+        problem_sizes = { 1000, 10000, 100000, 1000000, 10000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
         cutoff_ratios = { 3.0, 4.0, 5.0 };
     }
@@ -199,7 +239,7 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       cutoff_ratios );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes,
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
                                        cutoff_ratios );
 
     // Close the output file on rank 0.
diff --git a/benchmark/core/Cabana_NeighborVerletPerformance.cpp b/benchmark/core/Cabana_NeighborVerletPerformance.cpp
index 907fb3b73..310fd0ddb 100644
--- a/benchmark/core/Cabana_NeighborVerletPerformance.cpp
+++ b/benchmark/core/Cabana_NeighborVerletPerformance.cpp
@@ -39,6 +39,8 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     using LayoutTag = Cabana::VerletLayout2D;
     using BuildTag = Cabana::TeamVectorOpTag;
     using IterTag = Cabana::SerialOpTag;
+    using neigh_type =
+        Cabana::VerletList<memory_space, Cabana::FullNeighborTag, LayoutTag, BuildTag>;
 
     // Declare problem sizes.
     int num_problem_size = problem_sizes.size();
@@ -139,11 +141,9 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
                 // Create the neighbor list.
                 double cutoff = cutoff_ratios[c0];
                 create_timer.start( pid );
-                Cabana::VerletList<memory_space, Cabana::FullNeighborTag,
-                                   LayoutTag, BuildTag>
-                    nlist( Cabana::slice<0>( aosoas[p], "position" ), 0,
-                           num_p, cutoff, cell_ratios[c1], grid_min,
-                           grid_max );
+                neigh_type nlist( Cabana::slice<0>( aosoas[p], "position" ),
+                                  0, num_p, cutoff, cell_ratios[c1],
+                                  grid_min, grid_max );
                 create_timer.stop( pid );
 
                 // Iterate through the neighbor list.
@@ -157,39 +157,31 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
                 // Print neighbor statistics once per system.
                 if ( t == 0 )
                 {
-                    Kokkos::MinMaxScalar<int> min_max;
-                    Kokkos::MinMax<int> reducer( min_max );
-                    auto const& nlist_data_count =
-                        nlist._data.counts; // capture just the view
+                    std::size_t max_neigh;
+                    Kokkos::Max<std::size_t> max_reducer( max_neigh );
+                    std::size_t min_neigh;
+                    Kokkos::Min<std::size_t> min_reducer( min_neigh );
+                    std::size_t total_neigh;
+                    Kokkos::Sum<std::size_t> total_reducer( total_neigh );
                     Kokkos::parallel_reduce(
-                        "Cabana::countMinMax", policy,
-                        KOKKOS_LAMBDA(
-                            const int p,
-                            Kokkos::MinMaxScalar<int>& local_minmax ) {
-                            auto const val = nlist_data_count( p );
-                            if ( val < local_minmax.min_val )
-                            {
-                                local_minmax.min_val = val;
-                            }
-                            if ( val > local_minmax.max_val )
-                            {
-                                local_minmax.max_val = val;
-                            }
+                        "Cabana::Benchmark::countNeighbors", policy,
+                        KOKKOS_LAMBDA( const int p, std::size_t& min,
+                                       std::size_t& max,
+                                       std::size_t& sum ) {
+                            auto const val = Cabana::NeighborList<
+                                neigh_type>::numNeighbor( nlist, p );
+                            if ( val < min )
+                                min = val;
+                            if ( val > max )
+                                max = val;
+                            sum += val;
                         },
-                        reducer );
+                        min_reducer, max_reducer, total_reducer );
                     Kokkos::fence();
-                    std::cout << "List min neighbors: " << min_max.min_val
+                    std::cout << "List min neighbors: " << min_neigh
                               << std::endl;
-                    std::cout << "List max neighbors: " << min_max.max_val
+                    std::cout << "List max neighbors: " << max_neigh
                               << std::endl;
-                    int total_neigh = 0;
-                    Kokkos::parallel_reduce(
-                        "Cabana::countSum", policy,
-                        KOKKOS_LAMBDA( const int p, int& nsum ) {
-                            nsum += nlist._data.counts( p );
-                        },
-                        total_neigh );
-                    Kokkos::fence();
                     std::cout << "List avg neighbors: " << total_neigh / num_p
                               << std::endl;
@@ -232,11 +224,13 @@ int main( int argc, char* argv[] )
     if ( argc > 2 )
         run_type = argv[2];
     std::vector<int> problem_sizes = { 100, 1000 };
+    std::vector<int> host_problem_sizes = problem_sizes;
     std::vector<double> cutoff_ratios = { 2.0, 3.0 };
     std::vector<double> cell_ratios = { 1.0 };
     if ( run_type == "large" )
    {
-        problem_sizes = { 1000, 10000, 100000, 1000000 };
+        problem_sizes = { 1000, 10000, 100000, 1000000, 10000000 };
+        host_problem_sizes = { 1000, 10000, 100000 };
         cutoff_ratios = { 3.0, 4.0, 5.0 };
         cell_ratios = { 1.0 };
     }
@@ -258,7 +252,7 @@ int main( int argc, char* argv[] )
         performanceTest<device_type>( file, "device_", problem_sizes,
                                       cutoff_ratios, cell_ratios );
     }
-    performanceTest<host_device_type>( file, "host_", problem_sizes,
+    performanceTest<host_device_type>( file, "host_", host_problem_sizes,
                                        cutoff_ratios, cell_ratios );
 
     // Close the output file on rank 0.
diff --git a/benchmark/grid/Cabana_Grid_HaloPerformance.cpp b/benchmark/grid/Cabana_Grid_HaloPerformance.cpp
index 31a922159..fb8a862ee 100644
--- a/benchmark/grid/Cabana_Grid_HaloPerformance.cpp
+++ b/benchmark/grid/Cabana_Grid_HaloPerformance.cpp
@@ -214,13 +214,13 @@ int main( int argc, char* argv[] )
     // Don't run twice on the CPU if only host enabled.
     if ( !std::is_same<device_type, host_device_type>{} )
     {
-        performanceTest<device_type>( file, partitioner,
-                                      grid_sizes_per_dim_per_rank, "device_",
-                                      halo_widths, MPI_COMM_WORLD );
+        performanceTest<device_type>(
+            file, partitioner, grid_sizes_per_dim_per_rank, "device_device_",
+            halo_widths, MPI_COMM_WORLD );
     }
-    performanceTest<host_device_type>( file, partitioner,
-                                       grid_sizes_per_dim_per_rank, "host_",
-                                       halo_widths, MPI_COMM_WORLD );
+    performanceTest<host_device_type>(
+        file, partitioner, grid_sizes_per_dim_per_rank, "host_host_",
+        halo_widths, MPI_COMM_WORLD );
 
     // Finalize
     Kokkos::finalize();
diff --git a/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp b/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp
index 59c19d20d..b649123dd 100644
--- a/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp
+++ b/benchmark/grid/Cabana_Grid_InterpolationPerformance.cpp
@@ -177,8 +177,9 @@ void g2p2g( const ScalarValueP2GType& scalar_p2g, const Coordinates& points,
 // Performance test.
 template <class Device>
 void performanceTest( std::ostream& stream, const std::string& test_prefix,
-                      std::vector<int> cells_per_dim,
-                      std::vector<int> particles_per_cell )
+                      std::vector<int> cells_per_dim_per_rank,
+                      std::vector<int> particles_per_cell,
+                      const DimBlockPartitioner<3> partitioner, MPI_Comm comm )
 {
     using exec_space = typename Device::execution_space;
     using memory_space = typename Device::memory_space;
@@ -192,7 +193,7 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     std::array<bool, 3> is_dim_periodic = { false, false, false };
 
     // System sizes
-    int num_problem_size = cells_per_dim.size();
+    int num_problem_size = cells_per_dim_per_rank.size();
     int num_particles_per_cell = particles_per_cell.size();
 
     // Define the particle types.
@@ -201,7 +202,6 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     using aosoa_type = Cabana::AoSoA<member_types, memory_space>;
 
     // Define properties that do not depend on mesh size.
-    DimBlockPartitioner<3> partitioner;
     int halo_width = 1;
 
     uint64_t seed = 1938347;
@@ -298,9 +298,15 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     for ( int n = 0; n < num_problem_size; ++n )
     {
         // Create the global grid
-        double cell_size = 1.0 / cells_per_dim[n];
+        auto ranks_per_dim =
+            partitioner.ranksPerDimension( comm, { 0, 0, 0 } );
+        std::array<int, 3> cells_per_dim;
+        for ( int d = 0; d < 3; ++d )
+        {
+            cells_per_dim[d] = cells_per_dim_per_rank[n] * ranks_per_dim[d];
+        }
         auto global_mesh = createUniformGlobalMesh(
-            global_low_corner, global_high_corner, cell_size );
+            global_low_corner, global_high_corner, cells_per_dim );
         auto global_grid = createGlobalGrid( MPI_COMM_WORLD, global_mesh,
                                              is_dim_periodic, partitioner );
@@ -500,32 +506,32 @@ void performanceTest( std::ostream& stream, const std::string& test_prefix,
     }
 
     // Output results
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_scalar_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_vector_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_scalar_gradient_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_vector_divergence_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_tensor_divergence_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_scalar_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_vector_value_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_scalar_gradient_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_vector_gradient_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_vector_divergence_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   p2g_fused_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p_fused_timer );
-    outputResults( stream, "grid_size_per_dim", cells_per_dim,
-                   g2p2g_fused_timer );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_scalar_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_vector_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_scalar_gradient_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_vector_divergence_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_tensor_divergence_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_scalar_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_vector_value_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_scalar_gradient_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_vector_gradient_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_vector_divergence_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   p2g_fused_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p_fused_timer, comm );
+    outputResults( stream, "grid_size_per_dim", cells_per_dim_per_rank,
+                   g2p2g_fused_timer, comm );
 
     stream << std::flush;
 }
@@ -555,16 +561,50 @@ int main( int argc, char* argv[] )
     std::string run_type = "";
     if ( argc > 2 )
         run_type = argv[2];
-    std::vector<int> cells_per_dim = { 16, 32 };
+    std::vector<int> cells_per_dim_per_rank = { 16, 32 };
     std::vector<int> particles_per_cell = { 1, 4 };
     if ( run_type == "large" )
     {
-        cells_per_dim = { 16, 32, 64, 128 };
+        cells_per_dim_per_rank = { 16, 32, 64, 128 };
         particles_per_cell = { 1, 8, 16 };
     }
+
+    // Barrier before continuing.
+    MPI_Barrier( MPI_COMM_WORLD );
+
+    // Get comm rank and size.
+    int comm_rank;
+    MPI_Comm_rank( MPI_COMM_WORLD, &comm_rank );
+    int comm_size;
+    MPI_Comm_size( MPI_COMM_WORLD, &comm_size );
+
+    // Get partitioner.
+    DimBlockPartitioner<3> partitioner;
+    // Get ranks per dimension.
+    std::array<int, 3> ranks_per_dimension =
+        partitioner.ranksPerDimension( MPI_COMM_WORLD, { 0, 0, 0 } );
+
     // Open the output file on rank 0.
     std::fstream file;
-    file.open( filename, std::fstream::out );
+    // Output problem details.
+    if ( 0 == comm_rank )
+    {
+        file.open( filename + "_" + std::to_string( comm_size ),
+                   std::fstream::out );
+        file << "\n";
+        file << "Cabana::Grid Interpolation Performance Benchmark"
+             << "\n";
+        file << "----------------------------------------------"
+             << "\n";
+        file << "MPI Ranks: " << comm_size << "\n";
+        file << "MPI Cartesian Dim Ranks: (" << ranks_per_dimension[0] << ", "
+             << ranks_per_dimension[1] << ", " << ranks_per_dimension[2]
+             << ")\n";
+        file << "----------------------------------------------"
+             << "\n";
+        file << "\n";
+        file << std::flush;
+    }
 
     // Do everything on the default CPU.
     using host_exec_space = Kokkos::DefaultHostExecutionSpace;
@@ -576,11 +616,13 @@ int main( int argc, char* argv[] )
     // Don't run twice on the CPU if only host enabled.
     if ( !std::is_same<device_type, host_device_type>{} )
     {
-        performanceTest<device_type>( file, "device_", cells_per_dim,
-                                      particles_per_cell );
+        performanceTest<device_type>( file, "device_", cells_per_dim_per_rank,
+                                      particles_per_cell, partitioner,
+                                      MPI_COMM_WORLD );
     }
-    performanceTest<host_device_type>( file, "host_", cells_per_dim,
-                                       particles_per_cell );
+    performanceTest<host_device_type>( file, "host_", cells_per_dim_per_rank,
+                                       particles_per_cell, partitioner,
+                                       MPI_COMM_WORLD );
 
     // Close the output file on rank 0.
     file.close();
diff --git a/benchmark/plot/Cabana_BenchmarkPlotUtils.py b/benchmark/plot/Cabana_BenchmarkPlotUtils.py
index faa666925..86478ca0c 100644
--- a/benchmark/plot/Cabana_BenchmarkPlotUtils.py
+++ b/benchmark/plot/Cabana_BenchmarkPlotUtils.py
@@ -19,14 +19,25 @@ class DataDescription:
     def __init__(self, label):
         #Example: serial_neigh_iteration_3_1
+        #         host_host_halo_gather_0
         self.label = label
         details = label.split("_")
-        self.backend = details[0].strip()
-        self.type = details[1].strip()
-        self.category = details[2].strip()
+        index = 1
+        # MPI has two backends
+        if "host_host" in label or "device_device" in label or "host_device" in label:
+            self.backend = "_".join(details[0:2]).strip()
+            index += 1
+        else:
+            self.backend = details[0].strip()
+        self.type = details[index].strip()
+
+        index += 1
+        self.category = details[index].strip()
         if self.category == "iteration":
             self.category = "iterate"
+
+        index += 1
         self.params = []
-        for p in details[3:]:
+        for p in details[index:]:
             self.params.append(p.strip())
 
 # Header description for one series of runs with MPI.
@@ -38,22 +49,10 @@ def __init__(self, label):
         details = label.split("_")
         self.backend = "_".join(details[0:2]).strip()
         self.type = details[2].strip()
+        # This is the only difference that still requires this separate class.
         self.category = details[-1].strip()
         self.params = details[3:-1]
 
-# Header description for one series of runs for p2g/g2p.
-class DataDescriptionInterpolation(DataDescription):
-    # Purposely not calling base __init__
-    def __init__(self, label):
-        #Example: device_p2g_scalar_value_16
-        self.label = label
-        details = label.split("_")
-        self.backend = details[0].strip()
-        self.type = "interpolation"
-        self.category = details[1].strip()
-        self.params = ["_".join(details[2:4]).strip()]
-        self.size = details[-1]
-
 # Create a header description to compare results against.
 class ManualDataDescription:
     def __init__(self, backend, type, category, params):
@@ -112,22 +111,10 @@ def __init__(self, description, line):
         self.size = int(float(results[1]))
         self._initTimeResults(results[2:])
 
-# Single p2g/g2p result (single line in results file).
-class DataPointInterpolation(DataPoint):
-    # Purposely not calling base __init__
-    def __init__(self, description, line):
-        # Deep copy necessary because unique parameters are used per result (ppc)
-        self.description = deepcopy(description)
-
-        #ppc min max ave
-        self.line = line
-        results = line.split()
-        self.size = int(float(description.size))
-        self._initTimeResults(results[1:])
-        self.description.params.append(results[0])
-
 # All performance results from multiple files.
 class AllData:
+    mpi = False
+
     def __init__(self, filelist, grid=False):
         self.grid = grid
         self.results = []
@@ -213,13 +200,15 @@ def getAllCategories(self):
 
 # All MPI performance results from multiple files.
 class AllDataMPI(AllData):
+    mpi = True
+
     def _endOfFile(self, l):
         return l >= self.total
 
     def _readFile(self, filename):
         with open(filename) as f:
             txt = f.readlines()
-        size = int(txt[4].split()[-1])
+        size = int(txt[5].split()[-1])
 
         l = 8
         self.total = len(txt[l:])
@@ -237,8 +226,10 @@ def _readFile(self, filename):
         else:
             l += 1
 
-# All MPI performance results from multiple files.
+# All Grid performance results from multiple files.
 class AllDataGrid(AllData):
+    mpi = True
+
     def _endOfFile(self, l):
         return l > self.total
@@ -262,16 +254,7 @@ def _readFile(self, filename):
             else:
                 l += 1
 
-# All p2g/g2p performance results from multiple files.
-class AllDataInterpolation(AllData):
-    def _getDescription(self, line):
-        return DataDescriptionInterpolation(line)
-
-    def _getData(self, descr, line):
-        return DataPointInterpolation(descr, line)
-
 # All performance results for a single set of parameters.
-# FIXME: this may need to be sorted for plotting.
 class AllSizesSingleResult:
     def __init__(self, all_data: AllData, descr: ManualDataDescription):
         self.times = np.array([])
@@ -281,6 +264,11 @@ def __init__(self, all_data: AllData, descr: ManualDataDescription):
             self.sizes = np.append(self.sizes, d.size)
             self.times = np.append(self.times, d.ave)
 
+    def sort(self):
+        indices = np.argsort(self.sizes)
+        self.sizes = self.sizes[indices]
+        self.times = self.times[indices]
+
     def _compareAll(self, data_description, check):
         if data_description.backend == check.backend and data_description.category == check.category and data_description.type == check.type and data_description.params == check.params:
             return True
@@ -294,22 +282,22 @@ def getData(filelist):
     if "Cabana Comm" in txt:
         return AllDataMPI(filelist)
     # FIXME: Cajita backwards compatibility
-    elif ("Cajita Halo" in txt or "Cajita FFT" in txt or
-          "Cabana::Grid Halo" in txt or "Cabana::Grid FFT" in txt):
+    elif ("Cajita" in txt or "Cabana::Grid " in txt):
         return AllDataGrid(filelist, grid=True)
-    elif "g2p" in txt:
-        return AllDataInterpolation(filelist, grid=True)
     return AllData(filelist)
 
-# Return separate data for the first two files only.
-def getSeparateData(filelist):
+# Return separate data for the first and last N files.
+def getSeparateData(filelist, split_index = -1):
     if len(filelist) < 2:
         exit("Separate data requires at least two files.")
-    split_index = 1
-    for f, fname in enumerate(filelist):
-        if fname.split("_")[:-1] != filelist[0].split("_")[:-1]:
-            split_index = f
+    if len(filelist) % 2 != 0:
+        exit("Cannot compare odd number of files.")
+    # By default (without manual user input) search for a change in file names.
+    if split_index == -1:
+        for f, fname in enumerate(filelist):
+            if fname.split("_")[:-1] != filelist[0].split("_")[:-1]:
+                split_index = f
 
     data1 = getData(filelist[:split_index])
     data2 = getData(filelist[split_index:])
     return data1, data2
@@ -339,12 +327,11 @@ def getLegend(data: AllData, cpu_name, gpu_name, backend_label):
     if backend_label:
         backends = data.getAllBackends()
         for backend in data.getAllBackends():
-            # FIXME: backwards compatibility
             if "_host" in backend and "host_host" not in backend:
                 legend.append(Line2D([0], [0], color="k", lw=2, linestyle= "-.", label=cpu_name+" CPU"))
-            elif "host" in backend or "serial" in backend or "openmp" in backend:
+            elif "host" in backend:
                 legend.append(Line2D([0], [0], color="k", lw=2, linestyle= "--", label=cpu_name+" CPU"))
-            elif "device" in backend or "cuda" in backend or "hip" in backend:
+            elif "device" in backend:
                 legend.append(Line2D([0], [0], color="k", lw=2, linestyle="-", label=gpu_name+" GPU"))
 
     colors = getColors(data)
@@ -357,16 +344,15 @@
 def plotResults(ax, x, y, backend, color):
     linewidth = 2
     dash = "-"
-    # FIXME: backwards compatibility
     if "_host" in backend and "host_host" not in backend:
         dash = "-."
-    elif "host" in backend or "serial" in backend or "openmp" in backend:
+    elif "host" in backend:
         dash = "--"
 
     ax.plot(x, y, color=color, lw=linewidth, marker='o', linestyle=dash)
 
 # Add plot labels and show/save.
-def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_name="", gpu_name=""):
+def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_name="", gpu_name="", filename="Cabana_Benchmark.png", dpi=0):
     if speedup:
         min_max = data.minMaxSize()
         if data.grid: min_max = min_max**3
@@ -378,8 +364,11 @@ def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_na
         ax.set_ylabel("Speedup")
     else:
         ax.set_ylabel("Time (seconds)")
-    if data.grid:
-        ax.set_xlabel("Number of grid points")
+
+    if data.grid: # Always uses MPI
+        ax.set_xlabel("Number of grid points per rank")
+    elif data.mpi:
+        ax.set_xlabel("Number of particles per rank")
     else:
         ax.set_xlabel("Number of particles")
@@ -389,5 +378,8 @@ def createPlot(fig, ax, data: AllData, speedup=False, backend_label=True, cpu_na
     ax.set_yscale('log')
 
     fig.tight_layout()
-    plt.show()
-    #plt.savefig("Cabana_Benchmark.png", dpi=300)
+
+    if dpi:
+        plt.savefig(filename, dpi=dpi)
+    else:
+        plt.show()
diff --git a/benchmark/plot/Cabana_PlotBenchmark.py b/benchmark/plot/Cabana_PlotBenchmark.py
index b7473a34a..5e50aefe3 100644
--- a/benchmark/plot/Cabana_PlotBenchmark.py
+++ b/benchmark/plot/Cabana_PlotBenchmark.py
@@ -15,7 +15,7 @@ from Cabana_BenchmarkPlotUtils import *
 
 # Plot all results in a list of files.
-def plotAll(ax, data):
+def plotAll(ax, data, sort=False):
     color_dict = getColors(data)
     for backend in data.getAllBackends():
         for cat in data.getAllCategories():
@@ -23,6 +23,8 @@ def plotAll(ax, data):
             for param in data.getAllParams():
                 desc = ManualDataDescription(backend, type, cat, param)
                 result = AllSizesSingleResult(data, desc)
+                if sort:
+                    result.sort()
                 sizes = scaleSizes(result.sizes, data.grid)
                 plotResults(ax, sizes, result.times, backend, color_dict[cat])
@@ -45,8 +47,30 @@ def plotCompareHostDevice(ax, data, compare="host"):
                 num_2 = len(result2.times)
                 max = num_1 if num_1 < num_2 else num_2
 
-                sizes = scaleSizes(result.sizes, data.grid)
-                speedup = result2.times / result.times
+                sizes = scaleSizes(result.sizes[:max], data.grid)
+                speedup = result2.times[:max] / result.times[:max]
                 plotResults(ax, sizes, speedup, backend, color_dict[cat])
     return True
 
+# Compare results with and without an extra parameter (e.g. buffer) from a list of files.
+def plotCompare(ax, data, compare=["buffer"]):
+    color_dict = getColors(data)
+    for backend in data.getAllBackends():
+        if "host" in backend: continue
+        for cat in data.getAllCategories():
+            for type in data.getAllTypes():
+                for param in data.getAllParams():
+                    desc = ManualDataDescription(backend, type, cat, compare+param)
+                    result = AllSizesSingleResult(data, desc)
+                    desc2 = ManualDataDescription(backend, type, cat, param)
+                    result2 = AllSizesSingleResult(data, desc2)
+
+                    num_1 = len(result.times)
+                    num_2 = len(result2.times)
+                    max = num_1 if num_1 < num_2 else num_2
+
+                    sizes = scaleSizes(result.sizes[:max], data.grid)
+                    speedup = result2.times[:max] / result.times[:max]
+                    plotResults(ax, sizes, speedup, backend, color_dict[cat])
+    return True
+
@@ -91,11 +115,12 @@ def plotCompareFiles(ax, data1, data2, ignore_backend=[]):
     data = getData(filelist)
 
     speedup = plotAll(ax1, data)
-    #speedup = plotCompareHostDevice(ax1, data, "serial")
+    #speedup = plotCompareHostDevice(ax1, data)
+    #speedup = plotCompare(ax1, data)
 
     #data, data_f2 = getSeparateData(filelist)
-    #speedup = plotCompareFiles(ax1, data, data_f2, ["cuda_host", "cudauvm_cudauvm", "hip_host"])
+    #speedup = plotCompareFiles(ax1, data, data_f2)
     ###
 
     createPlot(fig1, ax1, data,
-               speedup=speedup, backend_label=True)#, cpu_name="POWER9", gpu_name="V100")
+               speedup=speedup, backend_label=True)#, cpu_name="EPYC", gpu_name="MI250X")