ginkgo ILU #1684

Open · wants to merge 7 commits into base: develop
6 changes: 3 additions & 3 deletions common/cuda_hip/factorization/ilu_kernels.cpp
@@ -17,8 +17,8 @@ namespace ilu_factorization {


template <typename ValueType, typename IndexType>
void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* m)
void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* m)
{
const auto id = exec->get_device_id();
auto handle = exec->get_sparselib_handle();
@@ -55,7 +55,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);


} // namespace ilu_factorization
36 changes: 27 additions & 9 deletions common/cuda_hip/factorization/lu_kernels.cpp
@@ -85,7 +85,7 @@ __global__ __launch_bounds__(default_block_size) void initialize(
}


template <typename ValueType, typename IndexType>
template <bool has_all_fillin, typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void factorize(
const IndexType* __restrict__ row_ptrs, const IndexType* __restrict__ cols,
const IndexType* __restrict__ storage_offsets,
@@ -130,8 +130,16 @@ __global__ __launch_bounds__(default_block_size) void factorize(
upper_nz += config::warp_size) {
const auto upper_col = cols[upper_nz];
const auto upper_val = vals[upper_nz];
const auto output_pos = lookup.lookup_unsafe(upper_col) + row_begin;
vals[output_pos] -= scale * upper_val;
if (!has_all_fillin) {
const auto pos = lookup[upper_col];
if (pos != invalid_index<IndexType>()) {
vals[row_begin + pos] -= scale * upper_val;
}
} else {
const auto output_pos =
lookup.lookup_unsafe(upper_col) + row_begin;
vals[output_pos] -= scale * upper_val;
}
}
}
scheduler.mark_ready();
@@ -252,19 +260,29 @@ template <typename ValueType, typename IndexType>
void factorize(std::shared_ptr<const DefaultExecutor> exec,
const IndexType* lookup_offsets, const int64* lookup_descs,
const int32* lookup_storage, const IndexType* diag_idxs,
matrix::Csr<ValueType, IndexType>* factors,
matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
array<int>& tmp_storage)
{
const auto num_rows = factors->get_size()[0];
if (num_rows > 0) {
syncfree_storage storage(exec, tmp_storage, num_rows);
const auto num_blocks =
ceildiv(num_rows, default_block_size / config::warp_size);
kernel::factorize<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
factors->get_const_row_ptrs(), factors->get_const_col_idxs(),
lookup_offsets, lookup_storage, lookup_descs, diag_idxs,
as_device_type(factors->get_values()), storage, num_rows);
if (!has_all_fillin) {
kernel::factorize<false>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
factors->get_const_row_ptrs(),
factors->get_const_col_idxs(), lookup_offsets,
lookup_storage, lookup_descs, diag_idxs,
as_device_type(factors->get_values()), storage, num_rows);
} else {
kernel::factorize<true>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
factors->get_const_row_ptrs(),
factors->get_const_col_idxs(), lookup_offsets,
lookup_storage, lookup_descs, diag_idxs,
as_device_type(factors->get_values()), storage, num_rows);
}
}
}

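The template parameter above lets the compiler drop the per-entry fill-in check from the inner loop whenever the pattern is known to be complete, leaving a single runtime branch on the host side. A minimal standalone sketch of that dispatch pattern (placeholder names such as `apply_row_update` and `lookup_safe` are illustrative, not the Ginkgo kernel API):

```cpp
#include <cstdint>

template <bool has_all_fillin, typename Lookup>
void apply_row_update(const Lookup& lookup, std::int64_t col, double* vals,
                      std::int64_t row_begin, double scale, double upper_val)
{
    if constexpr (!has_all_fillin) {
        // Safe path: the target entry may be missing from the sparsity pattern.
        const auto pos = lookup.lookup_safe(col);  // assumed to return -1 on a miss
        if (pos >= 0) {
            vals[row_begin + pos] -= scale * upper_val;
        }
    } else {
        // Unsafe path: the entry is guaranteed to exist, so no check is needed.
        vals[row_begin + lookup.lookup_unsafe(col)] -= scale * upper_val;
    }
}

// Host-side dispatch: the runtime flag is resolved once, outside the hot loop.
template <typename Lookup>
void apply_row_update_any(bool has_all_fillin, const Lookup& lookup,
                          std::int64_t col, double* vals,
                          std::int64_t row_begin, double scale, double upper_val)
{
    if (has_all_fillin) {
        apply_row_update<true>(lookup, col, vals, row_begin, scale, upper_val);
    } else {
        apply_row_update<false>(lookup, col, vals, row_begin, scale, upper_val);
    }
}
```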
2 changes: 1 addition & 1 deletion core/device_hooks/common_kernels.inc.cpp
@@ -873,7 +873,7 @@ GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
namespace ilu_factorization {


GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);


} // namespace ilu_factorization
38 changes: 28 additions & 10 deletions core/factorization/factorization.cpp
@@ -31,10 +31,25 @@ GKO_REGISTER_OPERATION(initialize_l, factorization::initialize_l);

template <typename ValueType, typename IndexType>
std::unique_ptr<Factorization<ValueType, IndexType>>
Factorization<ValueType, IndexType>::unpack() const
Factorization<ValueType, IndexType>::unpack(
Review thread on the new unpack() signature:

Member: I think we want to improve the strategy type with 2.0 (or earlier), and the strategies have no impact on triangular solvers, so I think we should avoid adding new strategy-related functionality.

Member (author): I do not think 2.0 will happen soon. If we do not want to introduce this here, I can only create the factors through the original code path instead of unpacking them. @nbeams it should be fine, right? The unpacking code is the same.

Collaborator: You mean reverting back to before 57c0051? It should be fine, as far as I know. I tested it before that commit, anyway.

Member (author): Yes.

std::shared_ptr<typename matrix_type::strategy_type> lower_factor_strategy,
std::shared_ptr<typename matrix_type::strategy_type> upper_factor_strategy)
const
{
const auto exec = this->get_executor();
const auto size = this->get_size();
auto create_matrix = [](auto exec, auto size, auto vals, auto col_idxs,
auto row_ptrs, auto strategy) {
if (strategy == nullptr) {
return matrix_type::create(exec, size, std::move(vals),
std::move(col_idxs),
std::move(row_ptrs));
} else {
return matrix_type::create(exec, size, std::move(vals),
std::move(col_idxs), std::move(row_ptrs),
strategy);
}
};
switch (this->get_storage_type()) {
case storage_type::empty:
GKO_NOT_SUPPORTED(nullptr);
@@ -53,12 +68,14 @@ Factorization<ValueType, IndexType>::unpack() const
const auto u_nnz =
static_cast<size_type>(get_element(u_row_ptrs, size[0]));
// create matrices
auto l_mtx = matrix_type::create(
exec, size, array<value_type>{exec, l_nnz},
array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
auto u_mtx = matrix_type::create(
exec, size, array<value_type>{exec, u_nnz},
array<index_type>{exec, u_nnz}, std::move(u_row_ptrs));
auto l_mtx =
create_matrix(exec, size, array<value_type>{exec, l_nnz},
array<index_type>{exec, l_nnz}, std::move(l_row_ptrs),
lower_factor_strategy);
auto u_mtx =
create_matrix(exec, size, array<value_type>{exec, u_nnz},
array<index_type>{exec, u_nnz}, std::move(u_row_ptrs),
upper_factor_strategy);
// fill matrices
exec->run(make_initialize_l_u(mtx.get(), l_mtx.get(), u_mtx.get()));
return create_from_composition(
@@ -72,9 +89,10 @@ Factorization<ValueType, IndexType>::unpack() const
const auto l_nnz =
static_cast<size_type>(get_element(l_row_ptrs, size[0]));
// create matrices
auto l_mtx = matrix_type::create(
exec, size, array<value_type>{exec, l_nnz},
array<index_type>{exec, l_nnz}, std::move(l_row_ptrs));
auto l_mtx =
create_matrix(exec, size, array<value_type>{exec, l_nnz},
array<index_type>{exec, l_nnz}, std::move(l_row_ptrs),
lower_factor_strategy);
// fill matrices
exec->run(make_initialize_l(mtx.get(), l_mtx.get(), false));
auto u_mtx = l_mtx->conj_transpose();
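For reference, a hedged sketch of how a caller could use the extended unpack() to forward CSR strategies to the unpacked factors; it assumes an existing executor `exec` and a `std::shared_ptr` to a `gko::matrix::Csr<double, int>` named `mtx`, and the merge_path strategy choice is purely illustrative:

```cpp
using Csr = gko::matrix::Csr<double, int>;
// Generate a combined LU factorization, then unpack it into separate L and U
// factors whose CSR strategy matches the caller's triangular-solver preference.
auto lu = gko::experimental::factorization::Lu<double, int>::build()
              .on(exec)
              ->generate(mtx);
auto factors = lu->unpack(std::make_shared<Csr::merge_path>(),
                          std::make_shared<Csr::merge_path>());
auto l_factor = factors->get_lower_factor();
auto u_factor = factors->get_upper_factor();
```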
38 changes: 36 additions & 2 deletions core/factorization/ilu.cpp
@@ -10,6 +10,8 @@
#include <ginkgo/core/base/exception_helpers.hpp>
#include <ginkgo/core/config/config.hpp>
#include <ginkgo/core/config/registry.hpp>
#include <ginkgo/core/factorization/lu.hpp>
#include <ginkgo/core/matrix/sparsity_csr.hpp>

#include "core/base/array_access.hpp"
#include "core/config/config_helper.hpp"
Expand All @@ -24,7 +26,7 @@ namespace ilu_factorization {
namespace {


GKO_REGISTER_OPERATION(compute_ilu, ilu_factorization::compute_lu);
GKO_REGISTER_OPERATION(compute_ilu, ilu_factorization::sparselib_ilu);
GKO_REGISTER_OPERATION(add_diagonal_elements,
factorization::add_diagonal_elements);
GKO_REGISTER_OPERATION(initialize_row_ptrs_l_u,
@@ -52,6 +54,17 @@ Ilu<ValueType, IndexType>::parse(const config::pnode& config,
if (auto& obj = config.get("skip_sorting")) {
params.with_skip_sorting(config::get_value<bool>(obj));
}
if (auto& obj = config.get("algorithm")) {
using gko::factorization::factorize_algorithm;
auto str = obj.get_string();
if (str == "sparselib") {
params.with_algorithm(factorize_algorithm::sparselib);
} else if (str == "syncfree") {
params.with_algorithm(factorize_algorithm::syncfree);
} else {
GKO_INVALID_CONFIG_VALUE("algorithm", str);
}
}
return params;
}

@@ -66,7 +79,7 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(

// Converts the system matrix to CSR.
// Throws an exception if it is not convertible.
auto local_system_matrix = matrix_type::create(exec);
auto local_system_matrix = share(matrix_type::create(exec));
as<ConvertibleTo<matrix_type>>(system_matrix.get())
->convert_to(local_system_matrix);

@@ -79,6 +92,27 @@ std::unique_ptr<Composition<ValueType>> Ilu<ValueType, IndexType>::generate_l_u(
local_system_matrix.get(), false));

// Compute LU factorization
if (std::dynamic_pointer_cast<const OmpExecutor>(exec) ||
parameters_.algorithm == factorize_algorithm::syncfree) {
auto sparsity =
share(gko::matrix::SparsityCsr<ValueType, IndexType>::create_const(
exec, local_system_matrix->get_size(),
make_const_array_view(
exec, local_system_matrix->get_num_stored_elements(),
local_system_matrix->get_const_col_idxs()),
make_const_array_view(
exec, local_system_matrix->get_size()[0] + 1,
local_system_matrix->get_const_row_ptrs())));
auto unpack =
gko::experimental::factorization::Lu<ValueType, IndexType>::build()
.with_has_all_fillin(false)
.with_symbolic_factorization(sparsity)
.on(exec)
->generate(local_system_matrix)
->unpack(parameters_.l_strategy, parameters_.u_strategy);
return Composition<ValueType>::create(unpack->get_lower_factor(),
unpack->get_upper_factor());
}
exec->run(ilu_factorization::make_compute_ilu(local_system_matrix.get()));

// Separate L and U factors: nnz
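A hedged usage sketch for the new algorithm knob, assuming an existing executor `exec` and a `std::shared_ptr` to a `gko::matrix::Csr<double, int>` named `mtx`; per the branch above, OmpExecutor takes the syncfree-style path regardless of the parameter:

```cpp
// Selecting syncfree routes the ILU(0) numeric factorization through the Lu
// kernels with has_all_fillin = false instead of the vendor sparselib routine.
auto ilu_factory =
    gko::factorization::Ilu<double, int>::build()
        .with_algorithm(gko::factorization::factorize_algorithm::syncfree)
        .on(exec);
auto ilu = ilu_factory->generate(mtx);  // Composition holding the L and U factors
```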
8 changes: 4 additions & 4 deletions core/factorization/ilu_kernels.hpp
@@ -20,14 +20,14 @@ namespace gko {
namespace kernels {


#define GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType) \
void compute_lu(std::shared_ptr<const DefaultExecutor> exec, \
matrix::Csr<ValueType, IndexType>* system_matrix)
#define GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType) \
void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec, \
matrix::Csr<ValueType, IndexType>* system_matrix)


#define GKO_DECLARE_ALL_AS_TEMPLATES \
template <typename ValueType, typename IndexType> \
GKO_DECLARE_ILU_COMPUTE_LU_KERNEL(ValueType, IndexType)
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL(ValueType, IndexType)


GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(ilu_factorization,
15 changes: 7 additions & 8 deletions core/factorization/lu.cpp
@@ -69,6 +69,9 @@ Lu<ValueType, IndexType>::parse(const config::pnode& config,
if (auto& obj = config.get("skip_sorting")) {
params.with_skip_sorting(config::get_value<bool>(obj));
}
if (auto& obj = config.get("has_all_fillin")) {
params.with_has_all_fillin(config::get_value<bool>(obj));
}

return params;
}
@@ -151,19 +154,15 @@ std::unique_ptr<LinOp> Lu<ValueType, IndexType>::generate_impl(
factors->get_const_row_ptrs(), factors->get_const_col_idxs(), num_rows,
allowed_sparsity, storage_offsets.get_const_data(),
row_descs.get_data(), storage.get_data()));
// initialize factors
exec->run(make_fill_array(factors->get_values(),
factors->get_num_stored_elements(),
zero<ValueType>()));
exec->run(make_initialize(
mtx.get(), storage_offsets.get_const_data(), row_descs.get_const_data(),
storage.get_const_data(), diag_idxs.get_data(), factors.get()));
// run numerical factorization
array<int> tmp{exec};
exec->run(make_factorize(storage_offsets.get_const_data(),
row_descs.get_const_data(),
storage.get_const_data(),
diag_idxs.get_const_data(), factors.get(), tmp));
exec->run(make_factorize(
storage_offsets.get_const_data(), row_descs.get_const_data(),
storage.get_const_data(), diag_idxs.get_const_data(), factors.get(),
parameters_.has_all_fillin, tmp));
return factorization_type::create_from_combined_lu(std::move(factors));
}

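A hedged sketch of setting the new parameter directly on the Lu factory; it assumes an existing executor `exec`, a `std::shared_ptr` CSR matrix `mtx`, and a shared_ptr to a SparsityCsr `pattern` that may lack some of the fill-in a full symbolic factorization would produce:

```cpp
// With has_all_fillin = false, updates that fall outside the provided sparsity
// pattern are skipped via the safe lookup instead of being assumed to exist.
auto lu = gko::experimental::factorization::Lu<double, int>::build()
              .with_symbolic_factorization(pattern)
              .with_has_all_fillin(false)
              .on(exec)
              ->generate(mtx);
```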
2 changes: 1 addition & 1 deletion core/factorization/lu_kernels.hpp
@@ -33,7 +33,7 @@ namespace kernels {
const IndexType* lookup_offsets, const int64* lookup_descs, \
const int32* lookup_storage, const IndexType* diag_idxs, \
matrix::Csr<ValueType, IndexType>* factors, \
array<int>& tmp_storage)
bool has_all_fillin, array<int>& tmp_storage)


#define GKO_DECLARE_LU_SYMMETRIC_FACTORIZE_SIMPLE(IndexType) \
14 changes: 9 additions & 5 deletions core/matrix/csr_lookup.hpp
@@ -183,7 +183,8 @@ struct device_sparsity_lookup {
result = lookup_search_unsafe(col);
break;
}
GKO_ASSERT(local_cols[result] == col);
GKO_ASSERT(result >= 0 && result < row_nnz &&
local_cols[result] == col);
return result;
}

Expand Down Expand Up @@ -230,7 +231,8 @@ struct device_sparsity_lookup {
const auto out_idx =
block_bases[block] +
gko::detail::popcount(block_bitmaps[block] & prefix_mask);
GKO_ASSERT(local_cols[out_idx] == col);
GKO_ASSERT(out_idx >= 0 && out_idx < row_nnz &&
local_cols[out_idx] == col);
return out_idx;
}

Expand Down Expand Up @@ -262,15 +264,17 @@ struct device_sparsity_lookup {
(static_cast<unsigned_index_type>(col) * hash_param) % hashmap_size;
GKO_ASSERT(hashmap[hash] >= 0);
GKO_ASSERT(hashmap[hash] < row_nnz);
while (local_cols[hashmap[hash]] != col) {
auto out_idx = hashmap[hash];
// linear probing with invalid_index sentinel to avoid infinite loop
while (out_idx >= 0 && local_cols[out_idx] != col) {
hash++;
if (hash >= hashmap_size) {
hash = 0;
}
GKO_ASSERT(hashmap[hash] >= 0);
out_idx = hashmap[hash];
GKO_ASSERT(hashmap[hash] < row_nnz);
}
const auto out_idx = hashmap[hash];
// out_idx is either correct or invalid_index, the hashmap sentinel
Review thread on lines +267 to +277:

Collaborator: I am confused here. With this change, lookup_hash_unsafe is the same as lookup_hash (except for one extra assertion in the unsafe version -- the >= 0 check -- and I am not sure why). So lookup_hash_unsafe isn't really unsafe anymore? Why do we need any changes to the unsafe version at all, when the factorization switches to the safe lookup if requested (lookup[col] rather than lookup.lookup_unsafe(col))?

Member (author): lookup_hash_unsafe is mostly equal to lookup_hash, up to the one assert you mentioned. In my mind, calling lookup_unsafe means you assume it always returns a valid value without any checks, while calling lookup means you always check whether the result is valid. I keep both because of the extra assert: the unsafe version should report an invalid index in debug mode, since the result may not be checked in the calling code. I changed the unsafe lookup because the infinite loop makes debugging difficult and this change does not affect performance. BTW, I do not think there is a performance difference between the safe and unsafe versions, but I will keep this behavior for now; then we can see whether to use only the safe lookup.

return out_idx;
}

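To make the termination argument from the thread concrete, here is a small standalone sketch of linear probing with an invalid_index sentinel (illustrative only, not the Ginkgo implementation; it assumes the hashmap is oversized so empty sentinel slots always exist):

```cpp
#include <cstdint>
#include <vector>

constexpr std::int32_t invalid_index = -1;

// Returns the position of `col` within the row, or invalid_index on a miss.
std::int32_t lookup_hash(const std::vector<std::int32_t>& hashmap,
                         const std::vector<std::int32_t>& local_cols,
                         std::int32_t col, std::uint32_t hash_param)
{
    const auto hashmap_size = static_cast<std::uint32_t>(hashmap.size());
    auto hash = (static_cast<std::uint32_t>(col) * hash_param) % hashmap_size;
    auto out_idx = hashmap[hash];
    // Linear probing: stop on a match or on the invalid_index sentinel,
    // so a missing column can never cause an endless probe.
    while (out_idx >= 0 && local_cols[out_idx] != col) {
        hash = (hash + 1 == hashmap_size) ? std::uint32_t{0} : hash + 1;
        out_idx = hashmap[hash];
    }
    return out_idx;
}
```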
6 changes: 6 additions & 0 deletions core/test/config/factorization.cpp
@@ -111,6 +111,8 @@ struct Ilu : FactorizationConfigTest<gko::factorization::Ilu<float, int>,
typename gko::matrix::Csr<float, int>::sparselib>());
config_map["skip_sorting"] = pnode{true};
param.with_skip_sorting(true);
config_map["algorithm"] = pnode{"syncfree"};
param.with_algorithm(gko::factorization::factorize_algorithm::syncfree);
}

template <typename AnswerType>
Expand All @@ -122,6 +124,7 @@ struct Ilu : FactorizationConfigTest<gko::factorization::Ilu<float, int>,
check_strategy(res_param.l_strategy, ans_param.l_strategy);
check_strategy(res_param.u_strategy, ans_param.u_strategy);
ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
ASSERT_EQ(res_param.algorithm, ans_param.algorithm);
}
};

Expand Down Expand Up @@ -178,6 +181,8 @@ struct Lu : FactorizationConfigTest<
gko::experimental::factorization::symbolic_type::near_symmetric);
config_map["skip_sorting"] = pnode{true};
param.with_skip_sorting(true);
config_map["has_all_fillin"] = pnode{false};
param.with_has_all_fillin(false);
}

template <typename AnswerType>
Expand All @@ -190,6 +195,7 @@ struct Lu : FactorizationConfigTest<
ans_param.symbolic_factorization);
ASSERT_EQ(res_param.symbolic_algorithm, ans_param.symbolic_algorithm);
ASSERT_EQ(res_param.skip_sorting, ans_param.skip_sorting);
ASSERT_EQ(res_param.has_all_fillin, ans_param.has_all_fillin);
}
};

6 changes: 3 additions & 3 deletions dpcpp/factorization/ilu_kernels.dp.cpp
@@ -17,11 +17,11 @@ namespace ilu_factorization {


template <typename ValueType, typename IndexType>
void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;
void sparselib_ilu(std::shared_ptr<const DefaultExecutor> exec,
matrix::Csr<ValueType, IndexType>* m) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);
GKO_DECLARE_ILU_SPARSELIB_ILU_KERNEL);


} // namespace ilu_factorization
2 changes: 1 addition & 1 deletion dpcpp/factorization/lu_kernels.dp.cpp
@@ -39,7 +39,7 @@ template <typename ValueType, typename IndexType>
void factorize(std::shared_ptr<const DefaultExecutor> exec,
const IndexType* lookup_offsets, const int64* lookup_descs,
const int32* lookup_storage, const IndexType* diag_idxs,
matrix::Csr<ValueType, IndexType>* factors,
matrix::Csr<ValueType, IndexType>* factors, bool has_all_fillin,
array<int>& tmp_storage) GKO_NOT_IMPLEMENTED;

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);