Open
Description
// Sketch: run host-side I/O on a separate std::thread while the main thread
// issues Kokkos kernels, a copy, and an MPI step.
// The I/O thread starts running do_my_io() as soon as it is constructed.
std::thread io_thread([=]() { do_my_io(); });
// First kernel: functor1 dispatched over N iterations.
Kokkos::parallel_for(N, functor1);
// deep_copy takes (destination, source); going by the argument names this
// moves device data to the host — NOTE(review): confirm `host`/`device` are
// views in the respective memory spaces.
Kokkos::deep_copy(host, device);
// Placeholder (not a real call): presumably an MPI exchange using the host copy.
MPI(host)
// Second kernel: functor2 dispatched over N iterations.
Kokkos::parallel_for(N, functor2);
// Block until do_my_io() has finished before continuing.
io_thread.join();
- You need more concurrency -> parallelize within a point.
- You should probably launch elements with the same number of points together in one kernel.
- You need to look into templating on the number of points -> reduces the cost of accessing an element.
// Sketch: one kernel launch per element size class, using three nested levels
// of parallelism: a team per element (TeamPolicy league), team threads over
// the 3-D grid of points (TeamThreadMDRange), and vector lanes within a point
// (ThreadVectorRange). This is pseudo-code: several statements are placeholders.
// Element count per size class; n1..n5 are not defined in this snippet.
std::array<int,5> num_elements{n1,n2,n3,n4,n5};
// Team size per size class. Entry `size` equals (size+1)^3, which matches the
// TeamThreadMDRange extents used below.
std::array<int,5> team_size{1,8,27,64,125};
// Loop over the five size classes; each iteration issues its own parallel_for.
for(int size = 0; size<5; size++) {
// Placeholder: the initializer is intentionally left out in this sketch.
int vector_size = // depends on kernel - how much concurrency per point
// maybe not do this for team_size 1? or group multiple team_size 1 things together
// potentially use multiple Kokkos instances (partition_instance) i.e. CUDA streams, one per size
// League size = number of elements in this size class, so each team handles one element.
parallel_for(TeamPolicy(num_elements[size], team_size[size], vector_size), KOKKOS_LAMBDA(const team_handle_type& team) {
// element_map is not defined here; presumably it maps (size class, league
// rank) to an index into `elements` — NOTE(review): confirm.
int element = element_map(size, team.league_rank());
// Iterate the (size+1) x (size+1) x (size+1) index space with the team's threads.
parallel_for(TeamThreadMDRange(team, size+1, size+1, size+1), [&](int i0, int i1, int i2) {
// NOTE(review): ConcurrencyPerPoint is not defined in this snippet; presumably
// it is related to vector_size above — confirm.
// The three vector-level loops below are identical in this sketch; presumably
// they stand for separate per-point stages whose right-hand sides are elided.
parallel_for(ThreadVectorRange(team, ConcurrencyPerPoint), [&](int k) {
elements(element).data(i0,i1,i2,k) = ...
});
parallel_for(ThreadVectorRange(team, ConcurrencyPerPoint), [&](int k) {
elements(element).data(i0,i1,i2,k) = ...
});
parallel_for(ThreadVectorRange(team, ConcurrencyPerPoint), [&](int k) {
elements(element).data(i0,i1,i2,k) = ...
});
});
});
}
Metadata
Metadata
Assignees
Labels
No labels