diff --git a/core/distributed/preconditioner/bddc.cpp b/core/distributed/preconditioner/bddc.cpp index cb801120a4d..9d765d82498 100644 --- a/core/distributed/preconditioner/bddc.cpp +++ b/core/distributed/preconditioner/bddc.cpp @@ -3,6 +3,7 @@ // SPDX-License-Identifier: BSD-3-Clause #include +#include #include #include #include @@ -24,12 +25,15 @@ #include #include #include +#include #include #include #include #include #include +#include "parmetis.h" + namespace gko { namespace experimental { @@ -70,8 +74,9 @@ void Bddc::generate_interfaces() std::shared_ptr< const gko::experimental::distributed::Partition> - partition = share( - clone(exec, global_system_matrix_->get_row_partition().get())); + partition = + share(clone(exec->get_master(), + global_system_matrix_->get_row_partition().get())); // Find owning and non wonind interface idxs, count how many are shared with // each rank. @@ -230,6 +235,7 @@ void Bddc::generate_interfaces() // Generate local matrix data and communication pattern auto mat_data = global_system_matrix_->get_matrix_data().copy_to_host(); + mat_data.remove_zeros(); mat_data.sort_row_major(); matrix_data local_data(dim<2>(cnt, cnt)); @@ -262,67 +268,68 @@ void Bddc::generate_interfaces() std::vector> interface_dofs; std::vector> interface_dof_ranks; for (size_type i = 0; i < n_edges; i++) { - interface_dofs.emplace_back(edge_dofs[i]); - interface_dof_ranks.emplace_back(edge_ranks[i]); - /* auto edge = edge_dofs[i]; */ - /* auto ranks = edge_ranks[i]; */ - /* auto esize = edge.size(); */ - /* auto ematrix = local->create_submatrix(span{start, start + esize}, */ - /* span{start, start + esize}); - */ - /* ValueType min_val = one(); */ - /* IndexType min_idx = -1; */ - /* IndexType min_component = -1; */ - /* std::vector edge_map(esize, -1); */ - /* const auto row_ptrs = ematrix->get_const_row_ptrs(); */ - /* const auto col_idxs = ematrix->get_const_col_idxs(); */ - /* const auto vals = ematrix->get_const_values(); */ - /* std::vector component_vec{}; */ - /* int num_corners = 0; */ - /* for (size_type j = 0; j < esize; j++) { */ - /* if (edge_map[j] == -1) { */ - /* std::set work_set{}; */ - /* std::set component{}; */ - /* for (IndexType idx = row_ptrs[j]; idx < row_ptrs[j + 1]; */ - /* idx++) { */ - /* if (col_idxs[idx] == j) { */ - /* if (std::abs(vals[idx]) < std::abs(min_val)) { */ - /* min_val = vals[idx]; */ - /* min_idx = j; */ - /* min_component = connected_components; */ - /* } */ - /* } */ - /* work_set.insert(col_idxs[idx]); */ - /* edge_map[col_idxs[idx]] = connected_components; */ - /* } */ - /* while (!work_set.empty()) { */ - /* auto current = *work_set.begin(); */ - /* work_set.erase(current); */ - /* component.insert(edge[current]); */ - /* for (IndexType idx = row_ptrs[current]; */ - /* idx < row_ptrs[current + 1]; idx++) { */ - /* if (edge_map[col_idxs[idx]] == -1) { */ - /* work_set.insert(col_idxs[idx]); */ - /* edge_map[col_idxs[idx]] = connected_components; - */ - /* } */ - /* } */ - /* } */ - /* component_vec.clear(); */ - /* for (auto const& idx : component) { */ - /* component_vec.emplace_back(idx); */ - /* } */ - /* if (component.size() == 1) { */ - /* num_corners++; */ - /* } */ - /* if (component_vec.size() > 0) { */ - /* interface_dofs.emplace_back(component_vec); */ - /* interface_dof_ranks.emplace_back(ranks); */ - /* connected_components++; */ - /* } */ - /* } */ - /* } */ - /* start += esize; */ + if (parameters_.connected_components_analysis) { + auto edge = edge_dofs[i]; + auto ranks = edge_ranks[i]; + auto esize = edge.size(); + auto ematrix = local->create_submatrix(span{start, start + esize}, + span{start, start + esize}); + ValueType min_val = one(); + IndexType min_idx = -1; + IndexType min_component = -1; + std::vector edge_map(esize, -1); + const auto row_ptrs = ematrix->get_const_row_ptrs(); + const auto col_idxs = ematrix->get_const_col_idxs(); + const auto vals = ematrix->get_const_values(); + std::vector component_vec{}; + int num_corners = 0; + for (size_type j = 0; j < esize; j++) { + if (edge_map[j] == -1) { + std::set work_set{}; + std::set component{}; + for (IndexType idx = row_ptrs[j]; idx < row_ptrs[j + 1]; + idx++) { + if (col_idxs[idx] == j) { + if (std::abs(vals[idx]) < std::abs(min_val)) { + min_val = vals[idx]; + min_idx = j; + min_component = connected_components; + } + } + work_set.insert(col_idxs[idx]); + edge_map[col_idxs[idx]] = connected_components; + } + while (!work_set.empty()) { + auto current = *work_set.begin(); + work_set.erase(current); + component.insert(edge[current]); + for (IndexType idx = row_ptrs[current]; + idx < row_ptrs[current + 1]; idx++) { + if (edge_map[col_idxs[idx]] == -1) { + work_set.insert(col_idxs[idx]); + edge_map[col_idxs[idx]] = connected_components; + } + } + } + component_vec.clear(); + for (auto const& idx : component) { + component_vec.emplace_back(idx); + } + if (component.size() == 1) { + num_corners++; + } + if (component_vec.size() > 0) { + interface_dofs.emplace_back(component_vec); + interface_dof_ranks.emplace_back(ranks); + connected_components++; + } + } + } + start += esize; + } else { + interface_dofs.emplace_back(edge_dofs[i]); + interface_dof_ranks.emplace_back(edge_ranks[i]); + } } for (size_type i = 0; i < n_corners; i++) { interface_dofs.emplace_back(corner_dofs[i]); @@ -437,7 +444,8 @@ void Bddc::generate_interfaces() std::iota(order.begin(), order.end(), 0); std::sort(order.begin(), order.end(), [&](const IndexType& a, const IndexType& b) { - return idofs[a].size() > idofs[b].size(); + /* return idofs[a].size() > idofs[b].size(); */ + return iranks[a][0] < iranks[b][0]; }); for (size_type i = 0; i < idofs.size(); i++) { interface_dofs_.emplace_back(idofs[order[i]]); @@ -503,6 +511,7 @@ void Bddc::generate_interfaces() template void Bddc::pre_solve(const LinOp* b, LinOp* x) const { + auto start = std::chrono::high_resolution_clock::now(); auto exec = this->get_executor(); auto rhs = as(b); auto sol = as(x); @@ -526,14 +535,21 @@ void Bddc::pre_solve(const LinOp* b, LinOp* x) const schur_solution->get_local_values()), 1); comm.synchronize(); - if (inner_solver->apply_uses_initial_guess()) { - if (parameters_.use_amd) { - inner_buf2->fill(zero()); - } else { - inner_intermediate->fill(zero()); - } - } + /* if (inner_solver->apply_uses_initial_guess()) { */ if (parameters_.use_amd) { + inner_buf2->fill(zero()); + } else { + inner_intermediate->fill(zero()); + } + /* } */ + /* if (parameters_.multilevel && active) { */ + /* auto inner_res = vec_type::create(exec->get_master(), dim<2>{1,1}); + */ + /* inner_rhs->compute_norm2(inner_res); */ + /* std::cout << "RANK " << comm.rank() << " INNER RHS NORM " << + * inner_res->at(0,0) << std::endl; */ + /* } */ + if (parameters_.use_amd && inner_solver->get_size()[0] > 0) { inner_rhs->permute(AMD_inner, inner_buf1, matrix::permute_mode::rows); inner_solver->apply(inner_buf1, inner_buf2); inner_buf2->permute(AMD_inner, inner_intermediate, @@ -541,16 +557,30 @@ void Bddc::pre_solve(const LinOp* b, LinOp* x) const } else { inner_solver->apply(inner_rhs, inner_intermediate); } + /* if (parameters_.multilevel && active) { */ + /* auto inner_res = vec_type::create(exec->get_master(), dim<2>{1,1}); + */ + /* inner_intermediate->compute_norm2(inner_res); */ + /* std::cout << "RANK " << comm.rank() << " INNER SOL NORM " << + * inner_res->at(0,0) << std::endl; */ + /* } */ A_gi->apply(inner_intermediate, local_schur_sol); comm.synchronize(); RGT->apply(neg_one_op, schur_solution, one_op, sol); comm.synchronize(); + auto end = std::chrono::high_resolution_clock::now(); + if (comm.rank() == 0) { + /* std::cout << "PRE SOLVE: " << + * std::chrono::duration_cast(end - + * start).count() << std::endl; */ + } } template void Bddc::post_solve(const LinOp* b, LinOp* x) const { + auto start = std::chrono::high_resolution_clock::now(); auto exec = this->get_executor(); auto rhs = as(b); auto sol = as(x); @@ -577,14 +607,14 @@ void Bddc::post_solve(const LinOp* b, LinOp* x) const span{inner_idxs_.size(), local_sol->get_size()[0]}, span{0, 1}); comm.synchronize(); A_ig->apply(neg_one_op, bound_sol, one_op, inner_rhs); - if (inner_solver->apply_uses_initial_guess()) { - if (parameters_.use_amd) { - inner_buf2->fill(zero()); - } else { - inner_sol->fill(zero()); - } - } + /* if (inner_solver->apply_uses_initial_guess()) { */ if (parameters_.use_amd) { + inner_buf2->fill(zero()); + } else { + inner_sol->fill(zero()); + } + /* } */ + if (parameters_.use_amd && inner_solver->get_size()[0] > 0) { inner_rhs->permute(AMD_inner, inner_buf1, matrix::permute_mode::rows); inner_solver->apply(inner_buf1, inner_buf2); inner_buf2->permute(AMD_inner, inner_sol, @@ -599,6 +629,12 @@ void Bddc::post_solve(const LinOp* b, LinOp* x) const sol->compute_dot(nullspace, scale_op); sol->add_scaled(scale_op, neg_nullspace); } + auto end = std::chrono::high_resolution_clock::now(); + if (comm.rank() == 0) { + /* std::cout << "POST SOLVE: " << + * std::chrono::duration_cast(end - + * start).count() << std::endl; */ + } } @@ -612,7 +648,11 @@ void Bddc::apply_dense_impl(const VectorType* dense_b, auto rhs = static_condensate.get(); auto comm = rhs->get_communicator(); dense_x->fill(zero()); - pre_solve(dense_b, rhs); + if (!parameters_.skip_static_condensation) { + pre_solve(dense_b, rhs); + } else { + rhs->copy_from(dense_b); + } // Coarse grid correction auto coarse_rhs = vec_type::create( @@ -639,28 +679,48 @@ void Bddc::apply_dense_impl(const VectorType* dense_b, phi_t->apply(coarse_1, coarse_rhs); comm.synchronize(); RCT->apply(coarse_residual, coarse_b); - if (coarse_solver->apply_uses_initial_guess()) { - coarse_x->fill(zero()); - } + /* if (coarse_solver->apply_uses_initial_guess()) { */ + coarse_x->fill(zero()); + /* } */ + auto start = std::chrono::high_resolution_clock::now(); coarse_solver->apply(coarse_b, coarse_x); + auto end = std::chrono::high_resolution_clock::now(); + if (comm.rank() == 0) { + /* std::cout << "COARSE SOLVE: " << + * std::chrono::duration_cast(end - + * start).count() << ", ITERATIONS: " << + * coarse_logger->get_num_iterations() << std::endl; */ + } + comm.synchronize(); RC->apply(coarse_x, coarse_solution); phi->apply(coarse_sol, coarse_2); + start = std::chrono::high_resolution_clock::now(); // Subdomain correction if (fallback) { - auto ge = - coarse_1->create_submatrix(span{0, edge_idxs.size()}, span{0, 1}); - auto qe = - coarse_3->create_submatrix(span{0, edge_idxs.size()}, span{0, 1}); - e_perm->apply(ge, constrained_buf2); - constrained_buf2->scale_permute( - as(mc64->get_operators()[0]), constrained_buf1, - matrix::permute_mode::rows); - constrained_solver->apply(constrained_buf1, constrained_buf2); - constrained_buf2->scale_permute( - as(mc64->get_operators()[1]), constrained_buf1, - matrix::permute_mode::rows); - e_perm_t->apply(constrained_buf1, qe); + /* auto local_size = constrained_buf1->get_size()[0] - + * inner_idxs_.size(); */ + /* auto ge = */ + /* coarse_1->create_submatrix(span{0, local_size}, span{0, 1}); */ + /* auto qe = */ + /* coarse_3->create_submatrix(span{0, local_size}, span{0, 1}); */ + e_perm->apply(coarse_1, constrained_buf2); + if (active) { + constrained_buf2->scale_permute( + as(mc64->get_operators()[0]), constrained_buf1, + matrix::permute_mode::rows); + constrained_buf1->permute(AMD, constrained_buf2, + matrix::permute_mode::rows); + constrained_solver->apply(constrained_buf2, constrained_buf1); + constrained_buf1->permute(AMD, constrained_buf2, + matrix::permute_mode::inverse_rows); + constrained_buf2->scale_permute( + as(mc64->get_operators()[1]), constrained_buf1, + matrix::permute_mode::rows); + } else { + constrained_solver->apply(constrained_buf2, constrained_buf1); + } + e_perm_t->apply(constrained_buf1, coarse_3); } else { auto qe = coarse_3->create_submatrix(span{0, edge_idxs.size()}, span{0, 1}); @@ -672,9 +732,9 @@ void Bddc::apply_dense_impl(const VectorType* dense_b, auto im = local_1->create_submatrix(span{0, edge_idxs.size()}, span{0, 1}); if (c->get_size()[0] > 0) { - if (edge_solver->apply_uses_initial_guess()) { - edge_buf2->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + edge_buf2->fill(zero()); + /* } */ e_perm->apply(ge, edge_buf1); edge_solver->apply(edge_buf1, edge_buf2); e_perm_t->apply(edge_buf2, im); @@ -683,19 +743,25 @@ void Bddc::apply_dense_impl(const VectorType* dense_b, auto mue = local_3->create_submatrix(span{0, c->get_size()[0]}, span{0, 1}); c->apply(im, schur_rhs); - if (local_schur_solver->apply_uses_initial_guess()) { - mue->fill(zero()); - } + /* if (local_schur_solver->apply_uses_initial_guess()) { */ + mue->fill(zero()); + /* } */ local_schur_solver->apply(schur_rhs, mue); cT->apply(neg_one_op, mue, one_op, ge); } - if (edge_solver->apply_uses_initial_guess()) { - edge_buf2->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + edge_buf2->fill(zero()); + /* } */ e_perm->apply(ge, edge_buf1); edge_solver->apply(edge_buf1, edge_buf2); e_perm_t->apply(edge_buf2, qe); } + end = std::chrono::high_resolution_clock::now(); + if (comm.rank() == 0) { + /* std::cout << "LOCAL SOLVE: " << + * std::chrono::duration_cast(end - + * start).count() << std::endl; */ + } coarse_2->add_scaled(one_op, coarse_3); weights->apply(coarse_2, local_sol); @@ -741,16 +807,18 @@ void Bddc::generate() auto host = exec->get_master(); auto comm = global_system_matrix_->get_communicator(); auto rank = comm.rank(); - for (auto bdry_idx : parameters_.boundary_idxs) { - std::cout << "RANK " << rank << ": " << bdry_idx << std::endl; - } std::shared_ptr< const gko::experimental::distributed::Partition> - partition = share( + h_partition = share( clone(host, global_system_matrix_->get_row_partition().get())); + std::shared_ptr< + const gko::experimental::distributed::Partition> + partition = share( + clone(exec, global_system_matrix_->get_row_partition().get())); inner_idxs_ = parameters_.interior_dofs; interf_idxs_ = parameters_.interf_dofs; auto mat_data = global_system_matrix_->get_matrix_data().copy_to_host(); + mat_data.remove_zeros(); mat_data.sort_row_major(); if (inner_idxs_.size() == 0 || interf_idxs_.size() == 0) { std::set local_idxs; @@ -758,7 +826,7 @@ void Bddc::generate() std::vector non_local_cnts(comm.size(), 0); for (auto entry : mat_data.nonzeros) { auto idx = entry.row; - auto owner = find_part(partition, idx); + auto owner = find_part(h_partition, idx); if (owner == rank) { local_idxs.insert(idx); } else { @@ -773,8 +841,9 @@ void Bddc::generate() non_local_vec.emplace_back(non_local_idx); } std::sort(non_local_vec.begin(), non_local_vec.end(), - [partition](auto a, auto b) { - return find_part(partition, a) < find_part(partition, b); + [h_partition](auto a, auto b) { + return find_part(h_partition, a) < + find_part(h_partition, b); }); std::vector non_local_offsets(comm.size() + 1, 0); std::partial_sum(non_local_cnts.begin(), non_local_cnts.end(), @@ -806,7 +875,6 @@ void Bddc::generate() comm.synchronize(); auto n_inner = inner_idxs_.size(); - size_type n_interfaces = interfaces_.size(); size_type n_edges = edges.size(); size_type n_corners = corners.size(); @@ -825,8 +893,14 @@ void Bddc::generate() global_to_local.emplace(corner_idxs[i], n_inner + n_e_idxs + i); } - if (n_corners == 0) { + if (n_inner + n_interface_idxs == 0) { + std::cout << "RANK " << comm.rank() << " IS INACTIVE." << std::endl; + active = false; + } + + if (n_corners == 0 && active) { std::cout << "No corners found on rank " << rank << std::endl; + fallback = true; } // Set up Restriction Operator @@ -843,14 +917,16 @@ void Bddc::generate() schur_range_bounds.begin() + 1); auto ranges_array = array(host, range_bounds.begin(), range_bounds.end()); + ranges_array.set_executor(exec); auto R_part = gko::share( gko::experimental::distributed::Partition< - IndexType, IndexType>::build_from_contiguous(host, ranges_array)); + IndexType, IndexType>::build_from_contiguous(exec, ranges_array)); auto schur_ranges_array = array(host, schur_range_bounds.begin(), schur_range_bounds.end()); + schur_ranges_array.set_executor(exec); auto schur_part = gko::share( gko::experimental::distributed::Partition< - IndexType, IndexType>::build_from_contiguous(host, + IndexType, IndexType>::build_from_contiguous(exec, schur_ranges_array)); matrix_data R_data(dim<2>{ range_bounds[comm.size()], global_system_matrix_->get_size()[0]}); @@ -924,19 +1000,6 @@ void Bddc::generate() IDG = global_matrix_type::create(exec, comm); IDG->read_distributed(IDG_data, partition); - // Set up coarse partition - gko::array mapping{ - host, n_interfaces}; - for (size_type i = 0; i < n_interfaces; i++) { - auto ranks = interface_dof_ranks_[interfaces_[i]]; - auto owner = *std::min_element(ranks.begin(), ranks.end()); - mapping.get_data()[i] = 0; // owner; - } - auto part = - gko::share(gko::experimental::distributed::Partition< - IndexType, IndexType>::build_from_mapping(host, mapping, - comm.size())); - // Generate local matrix data and communication pattern matrix_data local_data( dim<2>(n_inner + n_interface_idxs, n_inner + n_interface_idxs)); @@ -1000,6 +1063,11 @@ void Bddc::generate() auto A_ii = share(local->create_submatrix(span{0, n_inner}, span{0, n_inner})); + /* if (!parameters_.multilevel && active) { */ + /* std::ofstream out_inner{"inner_" + std::to_string(comm.rank()) + + * ".mtx"}; */ + /* write(out_inner, A_ii); */ + /* } */ A_ig = local->create_submatrix(span{0, n_inner}, span{n_inner, n_inner + n_interface_idxs}); A_gi = local->create_submatrix(span{n_inner, n_inner + n_interface_idxs}, @@ -1020,19 +1088,28 @@ void Bddc::generate() span{n_inner + n_e_idxs, n_inner + n_interface_idxs})); // Generate inner solver and define Schur complement action - if (parameters_.use_amd) { - AMD_inner = share( - experimental::reorder::Amd::build().on(exec)->generate( - A_ii)); - auto A_ii_amd = share(A_ii->permute(AMD_inner)); - std::swap(A_ii, A_ii_amd); + if (n_inner > 0) { + if (parameters_.use_amd) { + AMD_inner = share(experimental::reorder::Amd::build() + .on(exec) + ->generate(A_ii)); + auto A_ii_amd = share(A_ii->permute(AMD_inner)); + std::swap(A_ii, A_ii_amd); + } + inner_solver = share(parameters_.inner_solver_factory->generate(A_ii)); + } else { + inner_solver = gko::matrix::Identity::create(exec, 0); } - inner_solver = share(parameters_.local_solver_factory->generate(A_ii)); one_op = share(initialize({one()}, exec)); neg_one_op = share(initialize({-one()}, exec)); auto zero_op = share(initialize({zero()}, exec)); - if (n_corners > 0) { + matrix_data coarse_data( + dim<2>{n_interfaces, n_interfaces}); + /* fallback = true; */ + if (!fallback) { + /* std::cout << "RANK " << comm.rank() << " STARTING DEFAULT SETUP" << + * std::endl; */ // Generate edge constraints matrix_data C_data( dim<2>{n_edges, n_inner + n_e_idxs}); @@ -1058,8 +1135,7 @@ void Bddc::generate() // Generate edge and local Schur complement solvers matrix_data e_perm_data( dim<2>{n_e_idxs, n_inner + n_e_idxs}); - std::shared_ptr AMD; - if (parameters_.use_amd) { + if (parameters_.use_amd && active) { AMD = share(experimental::reorder::Amd::build() .on(exec) ->generate(A_ee)); @@ -1077,11 +1153,14 @@ void Bddc::generate() one()); } } + e_perm_data.sort_row_major(); e_perm_t = share(matrix_type::create(exec)); e_perm_t->read(e_perm_data); e_perm = share(as(e_perm_t->transpose())); - edge_solver = share(parameters_.local_solver_factory->generate(A_ee)); + edge_solver = + active ? share(parameters_.local_solver_factory->generate(A_ee)) + : share(gko::matrix::Identity::create(exec, 0)); auto interm = vec_type::create(exec, dim<2>{n_inner + n_e_idxs, n_edges}); @@ -1094,18 +1173,18 @@ void Bddc::generate() span{edge, edge + 1}); auto interm_edge = interm->create_submatrix( span{0, n_inner + n_e_idxs}, span{edge, edge + 1}); - if (parameters_.use_amd) { + if (parameters_.use_amd && active) { CCT_edge->permute(AMD, edge_buf1, matrix::permute_mode::rows); - if (edge_solver->apply_uses_initial_guess()) { - edge_buf2->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + edge_buf2->fill(zero()); + /* } */ edge_solver->apply(edge_buf1, edge_buf2); edge_buf2->permute(AMD, interm_edge, matrix::permute_mode::inverse_rows); } else { - if (edge_solver->apply_uses_initial_guess()) { - interm_edge->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + interm_edge->fill(zero()); + /* } */ edge_solver->apply(CCT_edge, interm_edge); } } @@ -1159,19 +1238,19 @@ void Bddc::generate() span{0, n_inner + n_e_idxs}, span{i, i + 1}); auto interm_edge = schur_interm->create_submatrix( span{0, n_inner + n_e_idxs}, span{i, i + 1}); - if (parameters_.use_amd) { + if (parameters_.use_amd && active) { rhs_edge->permute(AMD, edge_buf1, matrix::permute_mode::rows); - if (edge_solver->apply_uses_initial_guess()) { - edge_buf2->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + edge_buf2->fill(zero()); + /* } */ edge_solver->apply(edge_buf1, edge_buf2); edge_buf2->permute(AMD, interm_edge, matrix::permute_mode::inverse_rows); } else { - if (edge_solver->apply_uses_initial_guess()) { - interm_edge->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + interm_edge->fill(zero()); + /* } */ edge_solver->apply(rhs_edge, interm_edge); } } @@ -1196,18 +1275,18 @@ void Bddc::generate() span{i, i + 1}); auto phi_edge = phi_e->create_submatrix(span{0, n_inner + n_e_idxs}, span{i, i + 1}); - if (parameters_.use_amd) { + if (parameters_.use_amd && active) { rhs_edge->permute(AMD, edge_buf1, matrix::permute_mode::rows); - if (edge_solver->apply_uses_initial_guess()) { - edge_buf2->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + edge_buf2->fill(zero()); + /* } */ edge_solver->apply(edge_buf1, edge_buf2); edge_buf2->permute(AMD, phi_edge, matrix::permute_mode::inverse_rows); } else { - if (edge_solver->apply_uses_initial_guess()) { - phi_edge->fill(zero()); - } + /* if (edge_solver->apply_uses_initial_guess()) { */ + phi_edge->fill(zero()); + /* } */ edge_solver->apply(rhs_edge, phi_edge); } } @@ -1220,36 +1299,62 @@ void Bddc::generate() span{0, n_corners + n_edges})); phi_t = as(phi->transpose()); auto h_lambda = clone(host, lambda); - matrix_data coarse_data( - dim<2>{n_interfaces, n_interfaces}); for (size_type i = 0; i < n_edges; i++) { - for (size_type j = 0; j < n_edges; j++) { + coarse_data.nonzeros.emplace_back(edges[i], edges[i], + -h_lambda->at(i, i)); + for (size_type j = i + 1; j < n_edges; j++) { coarse_data.nonzeros.emplace_back(edges[i], edges[j], -h_lambda->at(i, j)); + coarse_data.nonzeros.emplace_back(edges[j], edges[i], + -h_lambda->at(j, i)); } + /* for (size_type j = 0; j < n_edges; j++) { */ + /* coarse_data.nonzeros.emplace_back(edges[i], edges[j], */ + /* -h_lambda->at(i, j)); */ + /* } */ for (size_type j = 0; j < n_corners; j++) { coarse_data.nonzeros.emplace_back( edges[i], corners[j], -h_lambda->at(i, n_edges + j)); coarse_data.nonzeros.emplace_back( - corners[j], edges[i], -h_lambda->at(i, n_edges + j)); + corners[j], edges[i], -h_lambda->at(n_edges + j, i)); } } for (size_type i = 0; i < n_corners; i++) { - for (size_type j = 0; j < n_corners; j++) { + coarse_data.nonzeros.emplace_back( + corners[i], corners[i], + -h_lambda->at(n_edges + i, n_edges + i)); + for (size_type j = i + 1; j < n_corners; j++) { coarse_data.nonzeros.emplace_back( corners[i], corners[j], -h_lambda->at(n_edges + i, n_edges + j)); + coarse_data.nonzeros.emplace_back( + corners[j], corners[i], + -h_lambda->at(n_edges + j, n_edges + i)); } + /* for (size_type j = 0; j < n_corners; j++) { */ + /* coarse_data.nonzeros.emplace_back( */ + /* corners[i], corners[j], */ + /* -h_lambda->at(n_edges + i, n_edges + j)); */ + /* } */ } - global_coarse_matrix_ = global_matrix_type::create(exec, comm); - global_coarse_matrix_->read_distributed( - coarse_data, part, - gko::experimental::distributed::assembly::communicate); + /* if (comm.rank() == 0) { */ + /* std::ofstream out_coarse{"coarse.mtx"}; */ + /* gko::write(out_coarse, + * as(global_coarse_matrix_->get_local_matrix())); */ + /* out_coarse << std::flush; */ + /* } */ + /* std::cout << "RANK " << comm.rank() << " DONE WITH DEFAULT SETUP" << + * std::endl; */ } else { + /* std::cout << "RANK " << comm.rank() << " STARTING FALLBACK SETUP" << + * std::endl; */ + if (!parameters_.use_corners) { + n_corners = 0; + } auto n_rows = local->get_size()[0]; - local_data.size = dim<2>{n_rows + n_edges, n_rows + n_edges}; + local_data.size = + dim<2>{n_rows + n_edges + n_corners, n_rows + n_edges + n_corners}; for (size_type i = 0; i < n_edges; i++) { - std::cout << n_rows + i << std::endl; auto edge = interface_dofs_[interfaces_[edges[i]]]; ValueType val = one() / static_cast(edge.size()); @@ -1260,6 +1365,15 @@ void Bddc::generate() n_rows + i, val); } } + for (size_type i = 0; i < n_corners; i++) { + auto corner = + global_to_local.at(interface_dofs_[interfaces_[corners[i]]][0]); + auto constraint = n_rows + n_edges + i; + local_data.nonzeros.emplace_back(constraint, corner, + one()); + local_data.nonzeros.emplace_back(corner, constraint, + one()); + } auto constrained_mat = share(matrix_type::create(exec)); local_data.sort_row_major(); constrained_mat->read(local_data); @@ -1271,85 +1385,266 @@ void Bddc::generate() auto perm_constrained_mat = share(constrained_mat->scale_permute(row_op, col_op)); std::swap(constrained_mat, perm_constrained_mat); - auto phi_whole = - vec_type::create(exec, dim<2>{n_rows + n_edges, n_edges}); + if (active) { + AMD = share(experimental::reorder::Amd::build() + .on(exec) + ->generate(constrained_mat)); + auto h_AMD_inv = clone(host, AMD)->compute_inverse(); + perm_constrained_mat = share(constrained_mat->permute(AMD)); + std::swap(constrained_mat, perm_constrained_mat); + constrained_solver = + gko::experimental::solver::Direct::build() + .with_factorization( + gko::experimental::factorization::Lu::build() + .on(exec)) + .on(exec) + ->generate(constrained_mat); + } else { + constrained_solver = + gko::matrix::Identity::create(exec, 0); + } + auto phi_whole = vec_type::create( + exec, dim<2>{n_rows + n_edges + n_corners, n_edges + n_corners}); phi_whole->fill(zero()); auto h_constrained_rhs = vec_type::create( - exec->get_master(), dim<2>{n_rows + n_edges, n_edges}); + exec->get_master(), + dim<2>{n_rows + n_edges + n_corners, n_edges + n_corners}); h_constrained_rhs->fill(zero()); - for (size_type i = 0; i < n_edges; i++) { + for (size_type i = 0; i < n_edges + n_corners; i++) { h_constrained_rhs->at(n_rows + i, i) = one(); } auto constrained_rhs = clone(exec, h_constrained_rhs); - constrained_solver = - gko::experimental::solver::Direct::build() - .with_factorization( - gko::experimental::factorization::Lu::build() - .on(exec)) - .on(exec) - ->generate(constrained_mat); - constrained_buf1 = vec_type::create(exec, dim<2>{n_rows + n_edges, 1}); - constrained_buf2 = vec_type::create(exec, dim<2>{n_rows + n_edges, 1}); + constrained_buf1 = + vec_type::create(exec, dim<2>{n_rows + n_edges + n_corners, 1}); + constrained_buf2 = + vec_type::create(exec, dim<2>{n_rows + n_edges + n_corners, 1}); - for (size_type i = 0; i < n_edges; i++) { + for (size_type i = 0; i < n_edges + n_corners; i++) { auto e_rhs = constrained_rhs->create_submatrix( - span{0, n_rows + n_edges}, span{i, i + 1}); - auto e_sol = phi_whole->create_submatrix(span{0, n_rows + n_edges}, - span{i, i + 1}); - /* constrained_solver->apply(e_rhs, e_sol); */ - e_rhs->scale_permute(row_op, constrained_buf1, + span{0, n_rows + n_edges + n_corners}, span{i, i + 1}); + auto e_sol = phi_whole->create_submatrix( + span{0, n_rows + n_edges + n_corners}, span{i, i + 1}); + e_rhs->scale_permute(row_op, constrained_buf2, matrix::permute_mode::rows); + constrained_buf2->permute(AMD, constrained_buf1, + matrix::permute_mode::rows); constrained_solver->apply(constrained_buf1, constrained_buf2); - constrained_buf2->scale_permute(col_op, e_sol, + constrained_buf2->permute(AMD, constrained_buf1, + matrix::permute_mode::inverse_rows); + constrained_buf1->scale_permute(col_op, e_sol, matrix::permute_mode::rows); } phi = clone(phi_whole->create_submatrix(span{n_inner, n_rows}, - span{0, n_edges})); + span{0, n_edges + n_corners})); phi_t = as(phi->transpose()); auto lambda = clone(host, phi_whole->create_submatrix( - span{n_rows, n_rows + n_edges}, span{0, n_edges})); - matrix_data coarse_data( - dim<2>{n_interfaces, n_interfaces}); + span{n_rows, n_rows + n_edges + n_corners}, + span{0, n_edges + n_corners})); for (size_type i = 0; i < n_edges; i++) { for (size_type j = 0; j < n_edges; j++) { coarse_data.nonzeros.emplace_back(edges[i], edges[j], -lambda->at(i, j)); } + for (size_type j = 0; j < n_corners; j++) { + coarse_data.nonzeros.emplace_back(edges[i], corners[j], + -lambda->at(i, n_edges + j)); + coarse_data.nonzeros.emplace_back(corners[j], edges[i], + -lambda->at(n_edges + j, i)); + } + } + for (size_type i = 0; i < n_corners; i++) { + coarse_data.nonzeros.emplace_back( + corners[i], corners[i], -lambda->at(n_edges + i, n_edges + i)); + for (size_type j = i + 1; j < n_corners; j++) { + coarse_data.nonzeros.emplace_back( + corners[i], corners[j], + -lambda->at(n_edges + i, n_edges + j)); + coarse_data.nonzeros.emplace_back( + corners[j], corners[i], + -lambda->at(n_edges + j, n_edges + i)); + } } - global_coarse_matrix_ = global_matrix_type::create(exec, comm); - global_coarse_matrix_->read_distributed( - coarse_data, part, - gko::experimental::distributed::assembly::communicate); matrix_data e_perm_data( - dim<2>{n_rows + n_edges, n_e_idxs}); + dim<2>{n_rows + n_edges + n_corners, n_e_idxs + n_corners}); matrix_data e_perm_t_data( - dim<2>{n_e_idxs, n_rows + n_edges}); + dim<2>{n_e_idxs + n_corners, n_rows + n_edges + n_corners}); auto h_mc64 = clone(exec, mc64); auto h_row_op = as(h_mc64->get_operators()[0]); auto h_col_op = as(h_mc64->get_operators()[1]); auto h_row_inv = h_row_op->compute_inverse(); auto h_col_inv = h_col_op->compute_inverse(); - for (size_type i = 0; i < n_e_idxs; i++) { + for (size_type i = 0; i < n_e_idxs + n_corners; i++) { e_perm_data.nonzeros.emplace_back(n_inner + i, i, one()); e_perm_t_data.nonzeros.emplace_back(i, n_inner + i, one()); - /* e_perm_data.nonzeros.emplace_back(h_row_op->get_const_permutation()[n_inner - * + i], i, h_row_op->get_const_scaling_factors()[n_inner + i]); */ - /* e_perm_t_data.nonzeros.emplace_back(i, - * h_col_inv->get_const_permutation()[n_inner + i], - * h_col_inv->get_const_scaling_factors()[n_inner + i]); */ } + e_perm_data.sort_row_major(); + e_perm_t_data.sort_row_major(); e_perm_t = share(matrix_type::create(exec)); e_perm_t->read(e_perm_t_data); e_perm = share(matrix_type::create(exec)); e_perm->read(e_perm_data); - fallback = true; + /* std::cout << "RANK " << comm.rank() << " DONE WITH FALLBACK SETUP" << + * std::endl; */ } + coarse_data.remove_zeros(); + coarse_data.sort_row_major(); + + gko::array mapping{ + host, n_interfaces}; + /* std::cout << "RANK " << comm.rank() << " STARTING COARSE SETUP" << + * std::endl; */ + if (parameters_.multilevel) { + // Set up coarse mesh for ParMETIS + std::vector elmdist(comm.size() + 1); + std::iota(elmdist.begin(), elmdist.end(), 0); + std::vector eptr{0, n_edges + n_corners}; + std::vector eind(n_edges + n_corners); + for (size_type i = 0; i < n_edges; i++) { + eind[i] = edges[i]; + } + for (size_type i = 0; i < n_corners; i++) { + eind[n_edges + i] = corners[i]; + } + int elmwgt = 0; + int numflag = 0; + int ncon = 1; + int ncommonnodes = 2; + int nparts = comm.size() / parameters_.coarsening_ratio; + std::vector tpwgts(ncon * nparts, 1. / nparts); + std::vector ubvec(ncon, 1.05); + int options = 0; + int edgecut; + int new_part = comm.rank(); + MPI_Comm commptr = comm.get(); + + int ret = ParMETIS_V3_PartMeshKway( + elmdist.data(), eptr.data(), eind.data(), NULL, &elmwgt, &numflag, + &ncon, &ncommonnodes, &nparts, tpwgts.data(), ubvec.data(), + &options, &edgecut, &new_part, &commptr); + + /* std::cout << "METIS RETURNED WITH " << ret << " ON RANK " << + * comm.rank() << ", IS GOING TO PART " << new_part << std::endl; */ + + std::vector new_parts(comm.size()); + comm.all_gather(exec, &new_part, 1, new_parts.data(), 1); + comm.synchronize(); + + int elem_size = coarse_data.nonzeros.size(); + int elem_cnt = 0; + for (auto p : new_parts) { + if (p == comm.rank()) { + elem_cnt++; + } + } + std::vector elem_sizes(elem_cnt); + comm.i_send(exec, &elem_size, 1, new_part, 0); + size_type i = 0; + for (size_type j = 0; j < comm.size(); j++) { + auto p = new_parts[j]; + if (p == comm.rank()) { + comm.recv(exec, elem_sizes.data() + i, 1, j, 0); + i++; + } + } + comm.synchronize(); + + std::vector elem_offsets(elem_cnt + 1, 0); + std::partial_sum(elem_sizes.begin(), elem_sizes.end(), + elem_offsets.begin() + 1); + + std::vector send_row_idxs(elem_size); + std::vector send_col_idxs(elem_size); + std::vector send_values(elem_size); + for (size_type i = 0; i < elem_size; i++) { + send_row_idxs[i] = coarse_data.nonzeros[i].row; + send_col_idxs[i] = coarse_data.nonzeros[i].column; + send_values[i] = coarse_data.nonzeros[i].value; + } + std::vector recv_row_idxs(elem_offsets.back()); + std::vector recv_col_idxs(elem_offsets.back()); + std::vector recv_values(elem_offsets.back()); + + comm.i_send(exec, send_row_idxs.data(), elem_size, new_part, 0); + i = 0; + for (size_type j = 0; j < comm.size(); j++) { + auto p = new_parts[j]; + if (p == comm.rank()) { + comm.recv(exec, recv_row_idxs.data() + elem_offsets[i], + elem_sizes[i], j, 0); + i++; + } + } + comm.synchronize(); + comm.i_send(exec, send_col_idxs.data(), elem_size, new_part, 0); + i = 0; + for (size_type j = 0; j < comm.size(); j++) { + auto p = new_parts[j]; + if (p == comm.rank()) { + comm.recv(exec, recv_col_idxs.data() + elem_offsets[i], + elem_sizes[i], j, 0); + i++; + } + } + comm.synchronize(); + comm.i_send(exec, send_values.data(), elem_size, new_part, 0); + i = 0; + for (size_type j = 0; j < comm.size(); j++) { + auto p = new_parts[j]; + if (p == comm.rank()) { + comm.recv(exec, recv_values.data() + elem_offsets[i], + elem_sizes[i], j, 0); + i++; + } + } + comm.synchronize(); + + matrix_data complete_coarse_data( + coarse_data.size); + for (size_type i = 0; i < elem_offsets.back(); i++) { + complete_coarse_data.nonzeros.emplace_back( + recv_row_idxs[i], recv_col_idxs[i], recv_values[i]); + } + complete_coarse_data.sum_duplicates(); + + for (size_type i = 0; i < n_interfaces; i++) { + auto ranks = interface_dof_ranks_[interfaces_[i]]; + auto owner = new_parts[ranks[0]]; + /* std::cout << owner << std::endl; */ + mapping.get_data()[i] = owner; + } + coarse_data = complete_coarse_data; + } else { + for (size_type i = 0; i < n_interfaces; i++) { + auto ranks = interface_dof_ranks_[interfaces_[i]]; + auto owner = ranks[0]; + /* std::cout << owner << std::endl; */ + mapping.get_data()[i] = owner; + } + } + /* std::cout << "RANK " << comm.rank() << " DONE WITH COARSE SETUP" << + * std::endl; */ + // Set up coarse partition + mapping.set_executor(exec); + auto part = + gko::share(gko::experimental::distributed::Partition< + IndexType, IndexType>::build_from_mapping(exec, mapping, + comm.size())); + + + global_coarse_matrix_ = global_matrix_type::create(exec, comm); + global_coarse_matrix_->read_distributed( + coarse_data, part, + gko::experimental::distributed::assembly::communicate); coarse_solver = parameters_.coarse_solver_factory->generate(global_coarse_matrix_); + coarse_logger = gko::log::Convergence::create(); + coarse_solver->add_logger(coarse_logger); + /* std::cout << "RANK " << comm.rank() << " STARTING COARSE RESTRICTION" << + * std::endl; */ // Set up coarse restriction operator IndexType coarse_local_size = n_edges + n_corners; std::vector coarse_local_sizes(comm.size()); @@ -1357,11 +1652,13 @@ void Bddc::generate() std::vector coarse_range_bounds(comm.size() + 1, 0); std::partial_sum(coarse_local_sizes.begin(), coarse_local_sizes.end(), coarse_range_bounds.begin() + 1); - auto coarse_ranges_array = array( + auto h_coarse_ranges_array = array( host, coarse_range_bounds.begin(), coarse_range_bounds.end()); + auto coarse_ranges_array = array(exec); + coarse_ranges_array = h_coarse_ranges_array; auto RC_part = gko::share( gko::experimental::distributed::Partition< - IndexType, IndexType>::build_from_contiguous(host, + IndexType, IndexType>::build_from_contiguous(exec, coarse_ranges_array)); matrix_data RC_data( dim<2>{coarse_range_bounds[comm.size()], @@ -1390,9 +1687,12 @@ void Bddc::generate() RCT->read_distributed( RCT_data, part, RC_part, gko::experimental::distributed::assembly::communicate); + /* std::cout << "RANK " << comm.rank() << " DONE WITH COARSE RESTRICTION" << + * std::endl; */ + /* std::cout << "RANK " << comm.rank() << " STARTING WEIGHTS" << std::endl; + */ // Generate weights - // auto diag = clone(host, global_coarse_matrix_->extract_diagonal()); auto diag = global_system_matrix_->extract_diagonal(); auto diag_array = make_const_array_view(exec, diag->get_size()[0], diag->get_const_values()); @@ -1413,33 +1713,14 @@ void Bddc::generate() auto local_diag_vec = vec_type::create_const( exec, dim<2>{n_interface_idxs, 1}, std::move(local_diag_array), 1); - /* auto diag_max = std::max_element(local_diag_array.get_const_data(), */ - /* local_diag_array.get_const_data() + - * n_interface_idxs, */ - /* [](const ValueType& a, const ValueType& - * b) { */ - /* return std::abs(a) < std::abs(b); */ - /* }); */ - /* auto local_v = restricted_residual->clone(); */ - /* auto global_v = global_diag_vec->clone(); */ - /* local_v->fill(*diag_max); */ - /* RGT->apply(local_v, global_v); */ - /* RG->apply(global_v, local_v); */ - /* auto local_weights = vec_type::create(exec, dim<2>{n_interface_idxs, 1}); - */ - /* local_weights->fill(*diag_max); */ - /* auto gda = make_const_array_view(exec, n_interface_idxs, - * local_v->get_const_local_values()); */ - /* auto gd = diag_type::create_const(exec, n_interface_idxs, - * std::move(gda)); */ - auto weights_vec = vec_type::create(exec, dim<2>{n_interface_idxs, 1}); global_diag->inverse_apply(local_diag_vec, weights_vec); - // gd->inverse_apply(local_weights, weights_vec); auto weights_array = make_const_array_view(exec, n_interface_idxs, weights_vec->get_const_values()); weights = clone(diag_type::create_const(exec, n_interface_idxs, std::move(weights_array))); + /* std::cout << "RANK " << comm.rank() << " DONE WITH WEIGHTS" << std::endl; + */ comm.synchronize(); diff --git a/core/distributed/preconditioner/schwarz.cpp b/core/distributed/preconditioner/schwarz.cpp index d5466cd003a..4c1e1a68b9e 100644 --- a/core/distributed/preconditioner/schwarz.cpp +++ b/core/distributed/preconditioner/schwarz.cpp @@ -122,11 +122,16 @@ void Schwarz::generate( } if (parameters_.local_solver) { - this->set_solver(gko::share(parameters_.local_solver->generate( - as>(system_matrix) - ->get_local_matrix()))); - + if (as>( + system_matrix) + ->get_local_matrix() + ->get_size()[0] > 0) { + this->set_solver(gko::share(parameters_.local_solver->generate( + as>(system_matrix) + ->get_local_matrix()))); + } } else { this->set_solver(parameters_.generated_local_solver); } diff --git a/core/matrix/diagonal.cpp b/core/matrix/diagonal.cpp index 1a442ffc789..7388276b450 100644 --- a/core/matrix/diagonal.cpp +++ b/core/matrix/diagonal.cpp @@ -338,6 +338,7 @@ Diagonal::Diagonal(std::shared_ptr exec, : EnableLinOp(exec, dim<2>(size)), values_{exec, std::move(values)} { + if (size == 0) return; GKO_ENSURE_IN_BOUNDS(size - 1, values_.get_size()); } diff --git a/include/ginkgo/core/distributed/preconditioner/bddc.hpp b/include/ginkgo/core/distributed/preconditioner/bddc.hpp index df2be72f2f1..293d0bda9c8 100644 --- a/include/ginkgo/core/distributed/preconditioner/bddc.hpp +++ b/include/ginkgo/core/distributed/preconditioner/bddc.hpp @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -84,6 +85,9 @@ class Bddc : public EnableLinOp> { std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( local_solver_factory, nullptr); + std::shared_ptr GKO_FACTORY_PARAMETER_SCALAR( + inner_solver_factory, nullptr); + /** * Schur complement solver factory. */ @@ -133,6 +137,14 @@ class Bddc : public EnableLinOp> { bool GKO_FACTORY_PARAMETER_SCALAR(use_edges, true); bool GKO_FACTORY_PARAMETER_SCALAR(use_faces, false); + + bool GKO_FACTORY_PARAMETER_SCALAR(multilevel, false); + + bool GKO_FACTORY_PARAMETER_SCALAR(connected_components_analysis, false); + + bool GKO_FACTORY_PARAMETER_SCALAR(skip_static_condensation, false); + + index_type GKO_FACTORY_PARAMETER_SCALAR(coarsening_ratio, 8); }; GKO_ENABLE_LIN_OP_FACTORY(Bddc, parameters, Factory); GKO_ENABLE_BUILD_METHOD(Factory); @@ -191,6 +203,7 @@ class Bddc : public EnableLinOp> { LinOp* x) const override; private: + std::shared_ptr> coarse_logger; std::shared_ptr A_ig; std::shared_ptr A_gi; std::shared_ptr c; @@ -207,13 +220,15 @@ class Bddc : public EnableLinOp> { std::shared_ptr inner_solver; std::shared_ptr edge_solver; std::shared_ptr local_schur_solver; - std::shared_ptr coarse_solver; + std::shared_ptr coarse_solver; std::shared_ptr constrained_solver; + std::shared_ptr AMD; std::shared_ptr constrained_buf1; std::shared_ptr constrained_buf2; std::shared_ptr mc64; bool fallback = false; - std::shared_ptr weights; + bool active = true; + std::shared_ptr weights; std::shared_ptr phi; std::shared_ptr phi_t; std::vector interfaces_;