diff --git a/.github/workflows/ci_coverage.yml b/.github/workflows/ci_coverage.yml index a68fa8d5..eeef7123 100644 --- a/.github/workflows/ci_coverage.yml +++ b/.github/workflows/ci_coverage.yml @@ -51,7 +51,7 @@ jobs: uses: seqan/actions/setup-toolchain@main with: compiler: ${{ matrix.compiler }} - ccache_size: 125M + ccache_size: 200M - name: Install CMake uses: seqan/actions/setup-cmake@main @@ -68,7 +68,7 @@ jobs: mkdir build cd build cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DCMAKE_CXX_FLAGS="-Wno-interference-size" \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-interference-size" \ -DCHOPPER_NATIVE_BUILD=OFF make -j2 gtest_build @@ -93,8 +93,7 @@ jobs: ${{ github.workspace }}/build \ --filter ${{ github.workspace }}/include \ --filter ${{ github.workspace }}/src \ - --exclude ${{ github.workspace }}/src/measure_hyperloglog.cpp \ - --exclude ${{ github.workspace }}/src/display_layout \ + --exclude ${{ github.workspace }}/src/util \ --exclude-lines-by-pattern '^\s*$' \ --exclude-lines-by-pattern '^\s*};$' \ --exclude-lines-by-pattern '^.*GCOVR_EXCL_LINE.*$' \ diff --git a/.github/workflows/ci_linux.yml b/.github/workflows/ci_linux.yml index fb2e3264..8093bccf 100644 --- a/.github/workflows/ci_linux.yml +++ b/.github/workflows/ci_linux.yml @@ -31,17 +31,25 @@ jobs: fail-fast: true matrix: include: + - name: "clang17" + compiler: "clang-17" + build: unit + build_type: Release + - name: "gcc13" compiler: "gcc-13" build_type: Release + cxx_flags: -Wno-interference-size - name: "gcc12" compiler: "gcc-12" build_type: Release + cxx_flags: -Wno-interference-size - name: "gcc11" compiler: "gcc-11" build_type: Release + cxx_flags: -Wno-interference-size steps: - name: Checkout @@ -54,7 +62,11 @@ jobs: uses: seqan/actions/setup-toolchain@main with: compiler: ${{ matrix.compiler }} - ccache_size: 75M + ccache_size: 125M + + - name: Install OpenMP + if: contains(matrix.name, 'clang') + run: install libomp-17-dev - name: Install CMake uses: 
seqan/actions/setup-cmake@main @@ -66,7 +78,7 @@ jobs: mkdir build cd build cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DCMAKE_CXX_FLAGS="-Wno-interference-size" \ + -DCMAKE_CXX_FLAGS="-Werror ${{ matrix.cxx_flags }}" \ -DCHOPPER_NATIVE_BUILD=OFF make -j2 gtest_build diff --git a/.github/workflows/ci_macos.yml b/.github/workflows/ci_macos.yml index a490bed7..576872f1 100644 --- a/.github/workflows/ci_macos.yml +++ b/.github/workflows/ci_macos.yml @@ -54,7 +54,7 @@ jobs: uses: seqan/actions/setup-toolchain@main with: compiler: ${{ matrix.compiler }} - ccache_size: 75M + ccache_size: 125M - name: Install CMake uses: seqan/actions/setup-cmake@main @@ -66,7 +66,7 @@ jobs: mkdir build cd build cmake .. -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DCMAKE_CXX_FLAGS="-Wno-interference-size" \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-interference-size" \ -DCHOPPER_NATIVE_BUILD=OFF make -j3 gtest_build diff --git a/.github/workflows/ci_misc.yml b/.github/workflows/ci_misc.yml index ee6514d9..d8552789 100644 --- a/.github/workflows/ci_misc.yml +++ b/.github/workflows/ci_misc.yml @@ -64,7 +64,7 @@ jobs: mkdir build cd build cmake .. 
-DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ - -DCMAKE_CXX_FLAGS="-Wno-interference-size" \ + -DCMAKE_CXX_FLAGS="-Werror -Wno-interference-size" \ -DCHOPPER_HEADER_TEST_ONLY=ON \ -DCHOPPER_NATIVE_BUILD=OFF diff --git a/.gitmodules b/.gitmodules index d446a687..cdee4a2c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,9 +1,6 @@ [submodule "lib/seqan3"] path = lib/seqan3 url = https://github.com/seqan/seqan3.git -[submodule "lib/robin-hood-hashing"] - path = lib/robin-hood-hashing - url = https://github.com/martinus/robin-hood-hashing [submodule "lib/simde"] path = lib/simde url = https://github.com/simd-everywhere/simde diff --git a/include/chopper/configuration.hpp b/include/chopper/configuration.hpp index 309cd91c..346f443f 100644 --- a/include/chopper/configuration.hpp +++ b/include/chopper/configuration.hpp @@ -7,15 +7,11 @@ #pragma once -#include +#include #include +#include -#include #include -#include -#include - -#include #include // IWYU pragma: keep #include @@ -70,48 +66,9 @@ struct configuration //!\brief The HIBF config which will be used to compute the layout within the HIBF lib. 
seqan::hibf::config hibf_config; - void read_from(std::istream & stream) - { - std::string line; - std::stringstream config_str; - - while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_config_start) - ; - - assert(line == chopper::prefix::meta_chopper_config_start); - - while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_config_end) - { - assert(line.size() >= 2); - assert(std::string_view{line}.substr(0, 1) == seqan::hibf::prefix::meta_header); - config_str << line.substr(1); // remove seqan::hibf::prefix::meta_header - } - - assert(line == chopper::prefix::meta_chopper_config_end); + void read_from(std::istream & stream); - cereal::JSONInputArchive iarchive(config_str); - iarchive(*this); - - hibf_config.read_from(stream); - } - - void write_to(std::ostream & stream) const - { - // write json file to temprorary string stream with cereal - std::stringstream config_stream{}; - cereal::JSONOutputArchive output(config_stream); // stream to cout - output(cereal::make_nvp("chopper_config", *this)); - - // write config - stream << chopper::prefix::meta_chopper_config_start << '\n'; - std::string line; - while (std::getline(config_stream, line, '\n')) - stream << seqan::hibf::prefix::meta_header << line << '\n'; - stream << seqan::hibf::prefix::meta_header << "}\n" // last closing bracket isn't written by loop above - << chopper::prefix::meta_chopper_config_end << '\n'; - - hibf_config.write_to(stream); - } + void write_to(std::ostream & stream) const; private: friend class cereal::access; diff --git a/include/chopper/input_functor.hpp b/include/chopper/input_functor.hpp index 8f1bc413..83c6a35f 100644 --- a/include/chopper/input_functor.hpp +++ b/include/chopper/input_functor.hpp @@ -7,15 +7,14 @@ #pragma once +#include +#include #include #include #include -#include -#include - -#include +#include namespace chopper { @@ -38,36 +37,7 @@ struct input_functor uint8_t kmer_size{21}; - void operator()(size_t const num, 
seqan::hibf::insert_iterator it) - { - assert(filenames.size() > num); - if (input_are_precomputed_files) - { - uint64_t hash{}; - char * const hash_data{reinterpret_cast(&hash)}; - std::streamsize const hash_bytes{sizeof(hash)}; - - std::ifstream infile{filenames[num], std::ios::binary}; - - while (infile.read(hash_data, hash_bytes)) - it = hash; - } - else - { - sequence_file_type fin{filenames[num]}; - - seqan3::shape shape = seqan3::ungapped{kmer_size}; - auto minimizer_view = seqan3::views::minimiser_hash(shape, - seqan3::window_size{kmer_size}, - seqan3::seed{adjust_seed(shape.count())}); - - for (auto && [seq] : fin) - { - for (auto hash_value : seq | minimizer_view) - it = hash_value; - } - } - } + void operator()(size_t const num, seqan::hibf::insert_iterator it); }; } // namespace chopper diff --git a/include/chopper/layout/determine_best_number_of_technical_bins.hpp b/include/chopper/layout/determine_best_number_of_technical_bins.hpp new file mode 100644 index 00000000..d085c331 --- /dev/null +++ b/include/chopper/layout/determine_best_number_of_technical_bins.hpp @@ -0,0 +1,22 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include + +#include + +#include +#include + +namespace chopper::layout +{ + +std::pair> +determine_best_number_of_technical_bins(chopper::configuration & config); + +} diff --git a/include/chopper/layout/execute.hpp b/include/chopper/layout/execute.hpp index 26ef8491..84642c8e 100644 --- 
a/include/chopper/layout/execute.hpp +++ b/include/chopper/layout/execute.hpp @@ -7,6 +7,9 @@ #pragma once +#include +#include + #include namespace chopper::layout diff --git a/include/chopper/layout/hibf_statistics.hpp b/include/chopper/layout/hibf_statistics.hpp index 5d32eea2..81aeca40 100644 --- a/include/chopper/layout/hibf_statistics.hpp +++ b/include/chopper/layout/hibf_statistics.hpp @@ -9,18 +9,16 @@ #include #include -#include -#include +#include +#include #include #include +#include +#include #include #include -#include -#include -#include -#include #include #include @@ -50,12 +48,12 @@ namespace chopper::layout class hibf_statistics { public: - hibf_statistics() = default; //!< Defaulted. - hibf_statistics(hibf_statistics const & b) = default; //!< Defaulted. - hibf_statistics & operator=(hibf_statistics const &) = default; //!< Defaulted. - hibf_statistics(hibf_statistics && b) = default; //!< Defaulted. - hibf_statistics & operator=(hibf_statistics &&) = default; //!< Defaulted. - ~hibf_statistics() = default; //!< Defaulted. + hibf_statistics() = delete; //!< Deleted. Holds reference members. + hibf_statistics(hibf_statistics const & b) = delete; //!< Deleted. Holds const member. + hibf_statistics & operator=(hibf_statistics const &) = delete; //!< Deleted. Holds const member. + hibf_statistics(hibf_statistics && b) = delete; //!< Deleted. Holds const member. + hibf_statistics & operator=(hibf_statistics &&) = delete; //!< Deleted. Holds const member. + ~hibf_statistics() = default; //!< Defaulted. /*!\brief Construct an empty HIBF with an empty top level IBF * \param[in] config_ User configuration for the HIBF. 
@@ -64,18 +62,10 @@ class hibf_statistics */ hibf_statistics(configuration const & config_, std::vector const & sketches_, - std::vector const & kmer_counts) : - config{config_}, - fp_correction{ - seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_false_positive_rate, - .hash_count = config_.hibf_config.number_of_hash_functions, - .t_max = config_.hibf_config.tmax})}, - sketches{sketches_}, - counts{kmer_counts}, - total_kmer_count{std::accumulate(kmer_counts.begin(), kmer_counts.end(), size_t{})} - {} - - struct bin; // forward declaration + std::vector const & kmer_counts); + + //!\brief Represents a (set) of user bins (see ibf_statistics::bin_kind). + class bin; //!\brief A representation of an IBF level that gathers information about bins in an IBF. struct level @@ -94,326 +84,20 @@ class hibf_statistics merged //!< Multiple user bins are merged into a single technical bin. }; - //!\brief Represents a (set) of user bins (see ibf_statistics::bin_kind). - class bin - { - public: - bin_kind kind; //!< Either a split or merged bin. - size_t cardinality; //!< The size/weight of the bin (either a kmer count or hll sketch estimation). - size_t num_contained_ubs; //!< [MERGED] How many UBs are merged within this TB. - size_t num_spanning_tbs; //!< [SPLIT] How many TBs are used for this sindle UB. - std::vector user_bin_indices; //!< The user bin indices of this bin. - size_t tb_index; // The (first) technical bin idx this bin is stored in. - level child_level; //!< [MERGED] The lower level ibf statistics. - size_t child_level_idx; //!< [MERGED] The lower level ibf statistics. - - bin() = default; //!< Defaulted. - bin(bin const & b) = default; //!< Defaulted. - bin & operator=(bin const &) = default; //!< Defaulted. - bin(bin && b) = default; //!< Defaulted. - bin & operator=(bin &&) = default; //!< Defaulted. - ~bin() = default; //!< Defaulted. 
- - bin(bin_kind const kind_, size_t const spanning_tbs, std::vector const & user_bin_indices_) : - kind{kind_}, - num_contained_ubs{user_bin_indices_.size()}, - num_spanning_tbs{spanning_tbs}, - user_bin_indices{user_bin_indices_} - { - assert((kind == bin_kind::split && num_contained_ubs == 1u) - || (kind == bin_kind::merged && num_spanning_tbs == 1u)); - } - }; - //!\brief Gather all statistics to have all members ready. - void finalize() - { - collect_bins(); - - compute_cardinalities(top_level_ibf); - - compute_total_query_cost(top_level_ibf); - - gather_statistics(top_level_ibf, 0); - - expected_HIBF_query_cost = total_query_cost / total_kmer_count; - } + void finalize(); //!\brief Prints a column names of the summary to the command line. - static void print_header_to(std::ostream & stream, bool const verbose = true) - { - // print column names explanation in header - stream << "## ### Notation ###\n" - << "## X-IBF = An IBF with X number of bins.\n" - << "## X-HIBF = An HIBF with tmax = X, e.g a maximum of X technical bins on each level.\n"; - - stream << "## ### Column Description ###\n" - "## tmax : The maximum number of technical bin on each level\n" - "## c_tmax : The technical extra cost of querying an tmax-IBF, compared to 64-IBF\n" - "## l_tmax : The estimated query cost for an tmax-HIBF, compared to an 64-HIBF\n" - "## m_tmax : The estimated memory consumption for an tmax-HIBF, compared to an 64-HIBF\n" - "## (l*m)_tmax : Computed by l_tmax * m_tmax\n" - "## size : The expected total size of an tmax-HIBF\n" - << ((verbose) ? 
"## uncorr_size : The expected size of an tmax-HIBF without FPR correction\n" : ""); - - // print column names - stream << "# tmax" << '\t' << "c_tmax" << '\t' << "l_tmax" << '\t' << "m_tmax" << '\t' << "(l*m)_tmax" << '\t' - << "size"; - - if (verbose) // uncorrected size and add level statistics - { - stream << '\t' << "uncorr_size" << '\t' << "level" << '\t' << "num_ibfs" << '\t' << "level_size" << '\t' - << "level_size_no_corr" << '\t' << "total_num_tbs" << '\t' << "avg_num_tbs" << '\t' - << "split_tb_percentage" << '\t' << "max_split_tb" << '\t' << "avg_split_tb" << '\t' << "max_factor" - << '\t' << "avg_factor"; - } - - stream << '\n'; - } + static void print_header_to(std::ostream & stream, bool const verbose = true); //!\brief Prints a tab-separated summary of the statistics of this HIBF to the command line. - void print_summary_to(size_t & t_max_64_memory, std::ostream & stream, bool const verbose = true) - { - if (summaries.empty()) - finalize(); - - if (t_max_64_memory == 0) - t_max_64_memory = total_hibf_size_in_byte(); - - double const relative_memory_size = total_hibf_size_in_byte() / static_cast(t_max_64_memory); - double const query_time_memory_usage_prod = expected_HIBF_query_cost * relative_memory_size; - - stream << std::fixed << std::setprecision(2); - - std::string level_str, num_ibfs_str, level_size_str, level_size_no_corr_str, total_num_tbs_str, avg_num_tbs_str, - split_tb_percentage_str, max_split_tb_str, avg_split_tb_str, max_factor_str, avg_factor_str; - - size_t total_size{}; - size_t total_size_no_corr{}; - - // go through each level and collect and output the statistics - auto to_string_with_precision = [](auto num) - { - std::stringstream ss; - ss << std::fixed << std::setprecision(2) << num; - return ss.str(); - }; - - for (auto const & [level, s] : summaries) - { - size_t const level_size = std::reduce(s.ibf_mem_size.begin(), s.ibf_mem_size.end()); - size_t const level_size_no_corr = std::reduce(s.ibf_mem_size_no_corr.begin(), 
s.ibf_mem_size_no_corr.end()); - - total_size += level_size; - total_size_no_corr += level_size_no_corr; - - size_t const total_num_tbs = std::reduce(s.num_tbs.begin(), s.num_tbs.end()); - - size_t const total_num_split_tbs = std::reduce(s.num_split_tbs.begin(), s.num_split_tbs.end()); - double const split_tb_percentage = 100.0 * static_cast(total_num_split_tbs) / total_num_tbs; - - size_t const max_split_bin_span = *std::max_element(s.max_split_tb_span.begin(), s.max_split_tb_span.end()); - -#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wrestrict" -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY - - level_str += ":" + to_string_with_precision(level); - num_ibfs_str += ":" + to_string_with_precision(s.num_ibfs); - level_size_str += ":" + to_formatted_BF_size(level_size); - level_size_no_corr_str += ":" + to_formatted_BF_size(level_size_no_corr); - total_num_tbs_str += ":" + to_string_with_precision(total_num_tbs); - avg_num_tbs_str += ":" + to_string_with_precision(total_num_tbs / s.num_ibfs); - split_tb_percentage_str += ":" + to_string_with_precision(split_tb_percentage); - -#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY -# pragma GCC diagnostic pop -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY - - // if there are no split bins on this level, the following statistics don't make sense - if (max_split_bin_span != 0) - { - size_t const total_num_split_ubs = std::reduce(s.num_split_ubs.begin(), s.num_split_ubs.end()); - double const avg_split_bin = - static_cast(total_num_split_tbs) / static_cast(total_num_split_ubs); - size_t const total_split_tb_kmers = std::reduce(s.split_tb_kmers.begin(), s.split_tb_kmers.end()); - double const avg_factor = - static_cast(std::reduce(s.split_tb_corr_kmers.begin(), s.split_tb_corr_kmers.end())) - / static_cast(total_split_tb_kmers); - - max_split_tb_str += ":" + to_string_with_precision(max_split_bin_span); - avg_split_tb_str += ":" + to_string_with_precision(avg_split_bin); - 
max_factor_str += ":" + to_string_with_precision((fp_correction)[max_split_bin_span]); - avg_factor_str += ":" + to_string_with_precision(avg_factor); - } - else - { - max_split_tb_str += ":-"; - avg_split_tb_str += ":-"; - max_factor_str += ":-"; - avg_factor_str += ":-"; - } - } - - stream << std::fixed << std::setprecision(2); - - stream /* tmax */ << config.hibf_config.tmax - << '\t' - /* c_tmax */ - << chopper::layout::ibf_query_cost::interpolated( - config.hibf_config.tmax, - config.hibf_config.maximum_false_positive_rate) - << '\t' - /* l_tmax */ - << expected_HIBF_query_cost - << '\t' /*relative to a 64 bin IBF*/ - /* m_tmax */ - << relative_memory_size - << '\t' /*relative to the 64 T_Max HIBF*/ - /* (l*m)tmax */ - << query_time_memory_usage_prod - << '\t' - /* corr. size */ - << to_formatted_BF_size(total_size) << ((verbose) ? '\t' : '\n'); - - if (verbose) - { - // uncorrected FPR - stream /*uncorr. size */ << to_formatted_BF_size(total_size_no_corr) << '\t'; - - // per level statistics: - stream /* level */ << level_str - << '\t' - /* num_ibfs */ - << num_ibfs_str - << '\t' - /* level_size */ - << level_size_str - << '\t' - /* level_size_no_corr */ - << level_size_no_corr_str - << '\t' - /* total_num_tbs */ - << total_num_tbs_str - << '\t' - /* avg_num_tbs */ - << avg_num_tbs_str - << '\t' - /* split_tb_percentage */ - << split_tb_percentage_str - << '\t' - /* max_split_tb */ - << max_split_tb_str - << '\t' - /* avg_split_tb */ - << avg_split_tb_str - << '\t' - /* max_factor */ - << max_factor_str - << '\t' - /* avg_factor */ - << avg_factor_str << '\n'; - } - } + void print_summary_to(size_t & t_max_64_memory, std::ostream & stream, bool const verbose = true); //!\brief Return the total corrected size of the HIBF in bytes - size_t total_hibf_size_in_byte() - { - if (summaries.empty()) - finalize(); - - size_t total_size{}; - - // go through each level and collect the memory sizes - for (auto const & [level, summary] : summaries) - { - (void)level; - - 
total_size += std::reduce(summary.ibf_mem_size.begin(), summary.ibf_mem_size.end()); - } - - return compute_bin_size(total_size) / 8; - } + size_t total_hibf_size_in_byte(); //!\brief Round bytes to the appropriate unit and convert to string with unit. - [[nodiscard]] static std::string byte_size_to_formatted_str(size_t const bytes) - { - size_t iterations{}; - size_t integer{bytes}; - - while (integer >> 10u && iterations < 6u) - { - integer >>= 10u; - ++iterations; - } - - // While this is a bit more involved, we can avoid using floating point numbers. - auto first_decimal_position = [&]() - { - assert(iterations > 0u); - size_t decimal{bytes}; - decimal -= integer << (iterations * 10u); // Substract bytes represented by integer, e.g. -5GiB - decimal >>= (iterations - 1u) * 10u; // Shift to next smallest unit, e.g. 800MiB - decimal = decimal * 1000u / 1024u; // Account for using decimal system, i.e. 800MiB != 0.8GiB - size_t const diff{decimal - (decimal / 100u) * 100u}; // We want to round up to 1 decimal position - uint32_t const round_up{diff >= 50u}; - decimal += round_up * 100u - diff; - decimal /= 100u; - return decimal; - }; - - auto formatted_string = [&]() - { - static constexpr int8_t int_to_char_offset{'0'}; // int 0 as char: char{0 + 48} = '0' - size_t const decimal = iterations ? first_decimal_position() : 0u; - assert(decimal <= 10u); - - if (!iterations) // No decimals for Bytes - return std::to_string(integer); - else if (decimal < 10u) // No need to round integer part - return std::to_string(integer) + '.' + static_cast(decimal + int_to_char_offset); - else // Round integer part, e.g., 5.99 MiB should report 6.0 MiB - { - ++integer; - // Check whether rounding results in a change of unit, e.g. 
1023.99MiB to 1.0GiB - if (integer >> 10u) - { - ++iterations; - integer >>= 10u; - } - return std::to_string(integer) + ".0"; - } - }; - - std::string result{formatted_string()}; - switch (iterations) - { - case 0: - result += "Bytes"; - break; - case 1: - result += "KiB"; - break; - case 2: - result += "MiB"; - break; - case 3: - result += "GiB"; - break; - case 4: - result += "TiB"; - break; - case 5: - result += "PiB"; - break; - default: - result += "EiB"; - break; - } - - return result; - } + [[nodiscard]] static std::string byte_size_to_formatted_str(size_t const bytes); //!\brief The top level IBF of this HIBF, often starting point for recursions. level top_level_ibf; @@ -444,28 +128,7 @@ class hibf_statistics size_t const total_kmer_count{}; //!\brief Statistics for all IBFs on a certain level of the HIBF. - struct level_summary - { - size_t num_ibfs{}; - - std::vector num_tbs{}; - std::vector num_ubs{}; - - std::vector num_split_tbs{}; - std::vector num_merged_tbs{}; - - std::vector num_split_ubs{}; - std::vector num_merged_ubs{}; - - std::vector max_split_tb_span{}; - std::vector split_tb_corr_kmers{}; - std::vector split_tb_kmers{}; - - std::vector max_ubs_in_merged{}; - - std::vector ibf_mem_size{}; - std::vector ibf_mem_size_no_corr{}; - }; + struct level_summary; //!\brief The gathered summary of statistics for each level of this HIBF. 
std::map summaries; @@ -480,259 +143,79 @@ class hibf_statistics * ----------------------- * LN(1 - e^(LN(FPR) / HASHES) ) */ - size_t compute_bin_size(size_t const number_of_kmers_to_be_stored) const - { - return std::ceil( - -static_cast(number_of_kmers_to_be_stored * config.hibf_config.number_of_hash_functions) - / std::log(1 - - std::exp(std::log(config.hibf_config.maximum_false_positive_rate) - / config.hibf_config.number_of_hash_functions))); - } + size_t compute_bin_size(size_t const number_of_kmers_to_be_stored) const; /*!\brief Compute the Bloom Filter size from `number_of_kmers_to_be_stored` and * return it as a formatted string with the appropriate unit. * \param[in] number_of_kmers_to_be_stored */ - std::string to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const - { - size_t const size_in_bytes = compute_bin_size(number_of_kmers_to_be_stored) / 8; - return byte_size_to_formatted_str(size_in_bytes); - } + std::string to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const; - void collect_bins() - { - std::vector ibfs(hibf_layout.max_bins.size() + 1); // 0 = top_level - robin_hood::unordered_map, size_t> id_to_pos{}; - - // fill id_to_pos map - id_to_pos[std::vector{}] = 0; - for (size_t i = 0; i < hibf_layout.max_bins.size(); ++i) - id_to_pos[hibf_layout.max_bins[i].previous_TB_indices] = i + 1; - - for (auto const & user_bin_info : hibf_layout.user_bins) - { - std::vector prev{}; - - // add user bin index to previous merged bins - for (size_t i = 0; i < user_bin_info.previous_TB_indices.size(); ++i) - { - auto & ibf = ibfs[id_to_pos.at(prev)]; - auto const target_tb_index = user_bin_info.previous_TB_indices[i]; - - bool found_merged_bin{false}; - for (auto & previous_bins_to_check : ibf.bins) - { - if (previous_bins_to_check.tb_index == target_tb_index) - { - found_merged_bin = true; - previous_bins_to_check.user_bin_indices.push_back(user_bin_info.idx); - ++previous_bins_to_check.num_contained_ubs; - } - } - - if 
(!found_merged_bin) - { - ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); - ibf.bins.back().tb_index = target_tb_index; - auto next = prev; - next.push_back(target_tb_index); - ibf.bins.back().child_level_idx = id_to_pos.at(next); - } - prev.push_back(target_tb_index); - } - - // emplace a split bin at last since every user bin is on its lowest level single or split - auto & ibf = ibfs[id_to_pos.at(prev)]; - ibf.bins.emplace_back(hibf_statistics::bin_kind::split, - user_bin_info.number_of_technical_bins, - std::vector{user_bin_info.idx}); - ibf.bins.back().tb_index = user_bin_info.storage_TB_id; - } - - for (auto & ibf : ibfs) - for (auto & bin : ibf.bins) - if (bin.kind == hibf_statistics::bin_kind::merged) - bin.child_level = ibfs[bin.child_level_idx]; - - top_level_ibf = std::move(ibfs[0]); - } + void collect_bins(); - void compute_cardinalities(level & curr_level) - { - for (bin & current_bin : curr_level.bins) - { - if (current_bin.kind == bin_kind::merged) - { - if (config.hibf_config.disable_estimate_union) - { - size_t sum{}; - for (size_t i = 0; i < current_bin.user_bin_indices.size(); ++i) - sum += counts[current_bin.user_bin_indices[i]]; // TODO should be kmer_counts - current_bin.cardinality = sum; - } - else - { - assert(!current_bin.user_bin_indices.empty()); - seqan::hibf::sketch::hyperloglog hll = sketches[current_bin.user_bin_indices[0]]; - - for (size_t i = 1; i < current_bin.user_bin_indices.size(); ++i) - hll.merge(sketches[current_bin.user_bin_indices[i]]); - - current_bin.cardinality = hll.estimate(); - } - - compute_cardinalities(current_bin.child_level); - } - else if (current_bin.kind == bin_kind::split) // bin_kind::split - { - assert(current_bin.user_bin_indices.size() == 1); - current_bin.cardinality = counts[current_bin.user_bin_indices[0]]; - } - } - } + void compute_cardinalities(level & curr_level); //!\brief Computes the estimated query cost - void compute_total_query_cost(level & 
curr_level) - { - // Compute number of technical bins in current level (<= tmax) - size_t number_of_tbs{0}; - size_t level_kmer_count{0}; - size_t index{0}; - std::vector merged_bin_indices{}; - std::vector merged_bin_sketches{}; - - for (bin const & current_bin : curr_level.bins) - { - if (current_bin.kind == bin_kind::merged) - { - ++number_of_tbs; - merged_bin_indices.push_back(index); - - if (!config.hibf_config.disable_estimate_union) - { - // compute merged_bin_sketch - assert(!current_bin.user_bin_indices.empty()); - seqan::hibf::sketch::hyperloglog hll = sketches[current_bin.user_bin_indices[0]]; - - for (size_t i = 1; i < current_bin.user_bin_indices.size(); ++i) - hll.merge(sketches[current_bin.user_bin_indices[i]]); - - merged_bin_sketches.push_back(std::move(hll)); - } - } - else if (current_bin.kind == bin_kind::split) // bin_kind::split - { - number_of_tbs += current_bin.num_spanning_tbs; - level_kmer_count += current_bin.cardinality; - } - ++index; - } - assert(number_of_tbs <= config.hibf_config.tmax); - - // Add cost of querying the current IBF - // (how costly is querying number_of_tbs (e.g. 128 tbs) compared to 64 tbs given the current FPR) - curr_level.current_query_cost += - ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_false_positive_rate); - - // Add costs of querying the HIBF for each kmer in this level. - total_query_cost += curr_level.current_query_cost * level_kmer_count; - - // update query cost of all merged bins - for (size_t i = 0; i < merged_bin_indices.size(); ++i) - { - auto & current_bin = curr_level.bins[merged_bin_indices[i]]; - - // Pass on cost of querying the current level - current_bin.child_level.current_query_cost = curr_level.current_query_cost; - - // If merged bins share kmers, we need to penalize this - // because querying a kmer will result in multi level look-ups. 
- if (!config.hibf_config.disable_estimate_union) - { - double const current_estimate = merged_bin_sketches[i].estimate(); - - for (size_t j = i + 1; j < merged_bin_indices.size(); ++j) - { - seqan::hibf::sketch::hyperloglog tmp = - merged_bin_sketches[i]; // copy needed, s.t. current is not modified - double union_estimate = tmp.merge_and_estimate(merged_bin_sketches[j]); - // Jaccard distance estimate - double distance = 2.0 - (current_estimate + merged_bin_sketches[j].estimate()) / union_estimate; - // Since the sizes are estimates, the distance might be slighlty above 1.0 or below 0.0 - // but we need to avoid nagetive numbers - distance = std::min(std::max(distance, 0.0), 1.0); - - current_bin.child_level.current_query_cost += (1.0 - distance); - } - } - } - - // call function recursively for each merged bin - for (size_t i : merged_bin_indices) - compute_total_query_cost(curr_level.bins[i].child_level); - } + void compute_total_query_cost(level & curr_level); /*!\brief Recursively gather all the statistics from the bins. * \param[in] curr_level The current IBF from which the statistics will be extracted. * \param[in] level_summary_index The index of `curr_level` in `summeries`. 
*/ - void gather_statistics(level const & curr_level, size_t const level_summary_index) - { - level_summary & summary = summaries[level_summary_index]; - summary.num_ibfs += 1; - - size_t max_cardinality{}, max_cardinality_no_corr{}, num_tbs{}, num_ubs{}, num_split_tbs{}, num_merged_tbs{}, - num_split_ubs{}, num_merged_ubs{}, max_split_tb_span{}, split_tb_kmers{}, max_ubs_in_merged{}, - split_tb_corr_kmers{}; - - for (bin const & current_bin : curr_level.bins) - { - size_t const cardinality_per_split_bin = - (current_bin.cardinality + current_bin.num_spanning_tbs - 1) / current_bin.num_spanning_tbs; // round up - size_t const corrected_cardinality = - std::ceil(cardinality_per_split_bin * (fp_correction)[current_bin.num_spanning_tbs]); - max_cardinality = std::max(max_cardinality, corrected_cardinality); - max_cardinality_no_corr = std::max(max_cardinality_no_corr, cardinality_per_split_bin); - - num_tbs += current_bin.num_spanning_tbs; - num_ubs += current_bin.num_contained_ubs; - - if (current_bin.kind == bin_kind::split) - { - num_split_tbs += current_bin.num_spanning_tbs; - num_split_ubs += 1; - split_tb_corr_kmers += corrected_cardinality * current_bin.num_spanning_tbs; - split_tb_kmers += cardinality_per_split_bin * current_bin.num_spanning_tbs; - max_split_tb_span = std::max(max_split_tb_span, current_bin.num_spanning_tbs); - } - else - { - num_merged_tbs += 1; - num_merged_ubs += current_bin.num_contained_ubs; - max_ubs_in_merged = std::max(max_ubs_in_merged, current_bin.num_contained_ubs); - - gather_statistics(current_bin.child_level, level_summary_index + 1); - } - } - - summary.num_tbs.push_back(num_tbs); - summary.num_ubs.push_back(num_ubs); - - summary.num_split_tbs.push_back(num_split_tbs); - summary.num_merged_tbs.push_back(num_merged_tbs); - - summary.num_split_ubs.push_back(num_split_ubs); - summary.num_merged_ubs.push_back(num_merged_ubs); - - summary.max_split_tb_span.push_back(max_split_tb_span); - 
summary.split_tb_corr_kmers.push_back(split_tb_corr_kmers); - summary.split_tb_kmers.push_back(split_tb_kmers); - - summary.max_ubs_in_merged.push_back(max_ubs_in_merged); - - summary.ibf_mem_size.push_back(max_cardinality * num_tbs); - summary.ibf_mem_size_no_corr.push_back(max_cardinality_no_corr * num_tbs); + void gather_statistics(level const & curr_level, size_t const level_summary_index); +}; + +class hibf_statistics::bin +{ +public: + bin_kind kind; //!< Either a split or merged bin. + size_t cardinality; //!< The size/weight of the bin (either a kmer count or hll sketch estimation). + size_t num_contained_ubs; //!< [MERGED] How many UBs are merged within this TB. + size_t num_spanning_tbs; //!< [SPLIT] How many TBs are used for this single UB. + std::vector user_bin_indices; //!< The user bin indices of this bin. + size_t tb_index; // The (first) technical bin idx this bin is stored in. + level child_level; //!< [MERGED] The lower level ibf statistics. + size_t child_level_idx; //!< [MERGED] The lower level ibf statistics. + + bin() = default; //!< Defaulted. + bin(bin const & b) = default; //!< Defaulted. + bin & operator=(bin const &) = default; //!< Defaulted. + bin(bin && b) = default; //!< Defaulted. + bin & operator=(bin &&) = default; //!< Defaulted. + ~bin() = default; //!< Defaulted.
+ + bin(bin_kind const kind_, size_t const spanning_tbs, std::vector const & user_bin_indices_) : + kind{kind_}, + num_contained_ubs{user_bin_indices_.size()}, + num_spanning_tbs{spanning_tbs}, + user_bin_indices{user_bin_indices_} + { + assert((kind == bin_kind::split && num_contained_ubs == 1u) + || (kind == bin_kind::merged && num_spanning_tbs == 1u)); } }; +struct hibf_statistics::level_summary +{ + size_t num_ibfs{}; + + std::vector num_tbs{}; + std::vector num_ubs{}; + + std::vector num_split_tbs{}; + std::vector num_merged_tbs{}; + + std::vector num_split_ubs{}; + std::vector num_merged_ubs{}; + + std::vector max_split_tb_span{}; + std::vector split_tb_corr_kmers{}; + std::vector split_tb_kmers{}; + + std::vector max_ubs_in_merged{}; + + std::vector ibf_mem_size{}; + std::vector ibf_mem_size_no_corr{}; +}; + } // namespace chopper::layout diff --git a/include/chopper/layout/ibf_query_cost.hpp b/include/chopper/layout/ibf_query_cost.hpp index 81d533ef..544a0b9b 100644 --- a/include/chopper/layout/ibf_query_cost.hpp +++ b/include/chopper/layout/ibf_query_cost.hpp @@ -10,8 +10,8 @@ #include #include #include +#include #include -#include namespace chopper::layout { @@ -28,45 +28,9 @@ class ibf_query_cost ibf_query_cost & operator=(ibf_query_cost &&) = default; ~ibf_query_cost() = default; - static double exact(size_t const t_max, double const fpr) - { - auto it = find_closest_fpr(fpr); - - if (contains(t_max)) - return it->second[position(t_max)]; - else - throw std::invalid_argument("No exact data available for this t_max."); - } + static double exact(size_t const t_max, double const fpr); - static double interpolated(size_t const t_max, double const fpr) - { - auto it = find_closest_fpr(fpr); - - if (t_max <= 64u) - { - return it->second[0]; - } - else if (t_max > maximum_t_max) - { - throw std::invalid_argument("No data available for a t_max this large."); - } - else if (contains(t_max)) - { - return it->second[position(t_max)]; - } - else - { - size_t const 
upper_bound{std::bit_ceil(t_max)}; - size_t const lower_bound{upper_bound >> 1}; - double const upper_value{it->second[position(upper_bound)]}; - double const lower_value{it->second[position(lower_bound)]}; - - double const interpolated_value{lower_value - + (upper_value - lower_value) * (t_max - lower_bound) / lower_bound}; - assert(interpolated_value <= upper_value); - return interpolated_value; - } - } + static double interpolated(size_t const t_max, double const fpr); private: /*!\brief The cost factor to penalize a search in an IBF with more then 64 bins. @@ -90,28 +54,7 @@ class ibf_query_cost {0.0625, {1.0000, 1.1011, 1.2670, 1.5964, 2.4030, 3.6996, 7.1772, 12.4852, 23.3882, 44.7427, 87.8259}}, {0.3125, {1.0000, 1.2818, 1.5493, 2.2546, 3.7804, 6.5428, 12.9410, 24.4539, 47.6262, 93.4733, 185.1019}}}; - static std::map>::const_iterator find_closest_fpr(double const fpr) - { - if (auto it = cost_factors.find(fpr); it != cost_factors.end()) // fpr is found exaclty in map - return it; - - // otherwise search for the closest one in the map - auto lower_it = cost_factors.lower_bound(fpr); - auto upper_it = cost_factors.upper_bound(fpr); - - assert(lower_it != cost_factors.end() || upper_it != cost_factors.end()); - - if (lower_it == cost_factors.end()) - return upper_it; - - if (upper_it == cost_factors.end()) - return lower_it; - - if (std::abs(lower_it->first - fpr) < std::abs(upper_it->first - fpr)) - return lower_it; - else - return upper_it; - } + static std::map>::const_iterator find_closest_fpr(double const fpr); static constexpr bool contains(size_t const value) { diff --git a/include/chopper/layout/input.hpp b/include/chopper/layout/input.hpp index ab64dd44..855856c1 100644 --- a/include/chopper/layout/input.hpp +++ b/include/chopper/layout/input.hpp @@ -7,70 +7,20 @@ #pragma once -#include -#include -#include - -#include +#include +#include +#include +#include #include -#include #include namespace chopper::layout { -inline std::vector> 
read_filenames_from(std::istream & stream) -{ - std::vector> filenames{}; - std::string line; - - while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_user_bins_start) - ; - - assert(line == chopper::prefix::meta_chopper_user_bins_start); - -#ifndef NDEBUG - size_t counter{}; -#endif - while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_user_bins_end) - { - assert(line.size() >= 2); - assert(std::string_view{line}.substr(0, 1) == seqan::hibf::prefix::meta_header); - - // @0 file1.fa file2.fa - auto const bin_idx_pos = line.find(' '); - assert(bin_idx_pos != std::string::npos); - -#ifndef NDEBUG - size_t bin_idx{}; - std::from_chars(line.data() + 1, line.data() + bin_idx_pos, bin_idx); - assert(bin_idx == counter++); -#endif - - filenames.emplace_back(); - std::string_view const filename_str{line.begin() + bin_idx_pos + 1, line.end()}; - for (auto const && filename : std::views::split(filename_str, ' ')) - { - auto common_view = std::views::common(filename); - filenames.back().emplace_back(common_view.begin(), common_view.end()); - } - } - - assert(line == chopper::prefix::meta_chopper_user_bins_end); - - return filenames; -} - -inline auto read_layout_file(std::istream & stream) -{ - std::vector> filenames = chopper::layout::read_filenames_from(stream); - chopper::configuration chopper_config; - chopper_config.read_from(stream); - seqan::hibf::layout::layout hibf_layout{}; - hibf_layout.read_from(stream); - return std::make_tuple(std::move(filenames), std::move(chopper_config), std::move(hibf_layout)); -} +std::vector> read_filenames_from(std::istream & stream); +std::tuple>, configuration, seqan::hibf::layout::layout> +read_layout_file(std::istream & stream); } // namespace chopper::layout diff --git a/include/chopper/layout/output.hpp b/include/chopper/layout/output.hpp index ea0ba9f3..a66aafd7 100644 --- a/include/chopper/layout/output.hpp +++ b/include/chopper/layout/output.hpp @@ -7,23 +7,13 @@ #pragma once 
-#include - -#include -#include - -#include +#include +#include +#include namespace chopper::layout { -inline void write_user_bins_to(std::vector const & filenames, std::ostream & stream) -{ - stream << chopper::prefix::meta_chopper_user_bins_start << '\n'; - size_t counter{}; - for (auto const & filename : filenames) - stream << seqan::hibf::prefix::meta_header << counter++ << ' ' << filename << '\n'; - stream << chopper::prefix::meta_chopper_user_bins_end << '\n'; -} +void write_user_bins_to(std::vector const & filenames, std::ostream & stream); } // namespace chopper::layout diff --git a/include/chopper/set_up_parser.hpp b/include/chopper/set_up_parser.hpp index 7fa97cf0..4f9adbe7 100644 --- a/include/chopper/set_up_parser.hpp +++ b/include/chopper/set_up_parser.hpp @@ -11,220 +11,9 @@ #include -inline void set_up_parser(sharg::parser & parser, chopper::configuration & config) +namespace chopper { - parser.info.version = "1.0.0"; - parser.info.author = "Svenja Mehringer"; - parser.info.email = "svenja.mehringer@fu-berlin.de"; - parser.info.short_description = "Compute an HIBF layout"; - parser.info.description.emplace_back("Computes an HIBF layout that tries to minimize the disk space consumption of " - "the resulting index. The space is estimated using a k-mer count per user " - "bin which represents the potential denisity in a technical bin in an " - "interleaved Bloom filter. 
You can pass the resulting layout to raptor " - "(https://github.com/seqan/raptor) to build the index and " - "conduct queries."); +void set_up_parser(sharg::parser & parser, configuration & config); - parser.info.synopsis.emplace_back( - " --input [--output ] [--threads ] [--kmer ] [--fpr ] [--hash ] " - "[--disable-estimate-union] [--disable-rearrangement]"); - - parser.add_subsection("Main options:"); - // ----------------------------------------------------------------------------------------------------------------- - parser.add_option( - config.data_file, - sharg::config{ - .short_id = '\0', - .long_id = "input", - .description = - "The input must be a file containing paths to sequence data you wish to estimate; one filepath " - "per line. If your file contains auxiliary information (e.g. species IDs), your file must be tab-" - "separated.", - .required = true}); - parser.add_list_item("", "Example file:"); - parser.add_list_item("", "```"); - parser.add_list_item("", "/absolute/path/to/file1.fasta"); - parser.add_list_item("", "/absolute/path/to/file2.fa.gz"); - parser.add_list_item("", "```"); - - parser.add_option( - config.k, - sharg::config{ - .short_id = '\0', - .long_id = "kmer", - .description = - "The k-mer size influences the size estimates of the input. " - "Choosing a k-mer size that is too small for " - "your data will result in files appearing more similar than they really are. Likewise, a large " - "k-mer size might miss out on certain similarities. For DNA sequences, a k-mer size between " - "[16,32] has proven to work well."}); - - parser.add_option( - config.hibf_config.tmax, - sharg::config{ - .short_id = '\0', - .long_id = "tmax", - .description = - "Limits the number of technical bins on each level of the HIBF. Choosing a good tmax is not " - "trivial. The smaller tmax, the more levels the layout needs to represent the data. This results " - "in a higher space consumption of the index. 
While querying each individual level is cheap, " - "querying many levels might also lead to an increased runtime. " - "A good tmax is usually the square root of the number of user bins/samples rounded to the next " - "multiple of 64. Note that your tmax will always be rounded to the next multiple of 64. " - "At the expense of a longer runtime, you can enable the statistic mode that determines the best " - "tmax for your data set. See the advanced option --determine-best-tmax", - .default_message = "≈sqrt(#samples)", - .advanced = true}); - - parser.add_option( - config.hibf_config.number_of_hash_functions, - sharg::config{.short_id = '\0', - .long_id = "hash", - .description = - "The number of hash functions to use when building the HIBF from the resulting layout. " - "This parameter is needed to correctly estimate the index size when computing the layout."}); - - parser.add_option( - config.hibf_config.maximum_false_positive_rate, - sharg::config{.short_id = '\0', - .long_id = "fpr", - .description = - "The false positive rate you aim for when building the HIBF from the resulting layout. " - "This parameter is needed to correctly estimate the index size when computing the layout."}); - - parser.add_option( - config.output_filename, - sharg::config{.short_id = '\0', .long_id = "output", .description = "A file name for the resulting layout."}); - - parser.add_option( - config.hibf_config.threads, - sharg::config{ - .short_id = '\0', - .long_id = "threads", - .description = - "The number of threads to use. 
Currently, only merging of sketches is parallelized, so if the flag " - "--disable-rearrangement is set, --threads will have no effect.", - .validator = - sharg::arithmetic_range_validator{static_cast(1), std::numeric_limits::max()}}); - - parser.add_subsection("HyperLogLog Sketches:"); - parser.add_line("To improve the layout, you can estimate the sequence similarities using HyperLogLog sketches."); - - parser.add_flag( - config.hibf_config.disable_estimate_union, - sharg::config{ - .short_id = '\0', - .long_id = "disable-estimate-union", - .description = - "The sketches are used to estimate the sequence similarity among a set of user bins. This will improve " - "the layout computation as merging user bins that do not increase technical bin sizes will be " - "preferred. This may use more RAM and can be disabled in RAM-critical environments. " - "Attention: Also disables rearrangement which depends on union estimations."}); - - parser.add_flag( - config.hibf_config.disable_rearrangement, - sharg::config{ - .short_id = '\0', - .long_id = "disable-rearrangement", - .description = - "As a preprocessing step, rearranging the order of the given user bins based on their sequence " - "similarity may lead to favourable small unions and thus a smaller index. " - "Depending on the number of input samples (user bins), this may be time-consuming and can thus be " - "disabled if a suboptimal layout is sufficient."}); - - parser.add_subsection("Parameter Tweaking:"); - // ----------------------------------------------------------------------------------------------------------------- - parser.add_option( - config.hibf_config.alpha, - sharg::config{ - .short_id = '\0', - .long_id = "alpha", - .description = - "The layout algorithm optimizes the space consumption of the resulting HIBF but currently has no " - "means of optimizing the runtime for querying such an HIBF. 
In general, the ratio of merged bins " - "and split bins influences the query time because a merged bin always triggers another search on " - "a lower level. To influence this ratio, alpha can be used. The higher alpha, the less merged " - "bins are chosen in the layout. This improves query times but leads to a bigger index.", - .advanced = true}); - - parser.add_option( - config.hibf_config.max_rearrangement_ratio, - sharg::config{ - .short_id = '\0', - .long_id = "max-rearrangement-ratio", - .description = - "When the flag --disable-rearrangement is \\fBnot\\fP set, this option can influence the rearrangement " - "algorithm. The algorithm only rearranges the order of user bins in fixed intervals. The higher " - "--max-rearrangement-ratio, the larger the intervals. This potentially improves the layout, but " - "increases the runtime of the layout algorithm.", - .advanced = true, - .validator = sharg::arithmetic_range_validator{0.0, 1.0}}); - - parser.add_option( - config.hibf_config.sketch_bits, - sharg::config{.short_id = '\0', - .long_id = "sketch-bits", - .description = - "The number of bits the HyperLogLog sketch should use to distribute the values into bins.", - .advanced = true, - .validator = sharg::arithmetic_range_validator{5, 32}}); - - parser.add_subsection("Special options"); - // ----------------------------------------------------------------------------------------------------------------- - parser.add_flag( - config.determine_best_tmax, - sharg::config{ - .short_id = '\0', - .long_id = "determine-best-tmax", - .description = - "When this flag is set, the program will compute multiple layouts for tmax in " - "[64 , 128, 256, ... , tmax] as well as tmax=sqrt(#samples). " - "The layout algorithm itself only optimizes the space consumption. When determining the best " - "layout, we additionally keep track of the average number of queries needed to traverse each " - "layout. 
This query cost is taken into account when determining the best tmax for your data. " - "Note that the option --tmax serves as upper bound. Once the layout quality starts dropping, the " - "computation is stopped. To run all layout computations, pass the flag --force-all-binnings.", - .advanced = true}); - - parser.add_flag( - config.force_all_binnings, - sharg::config{ - .short_id = '\0', - .long_id = "force-all-binnings", - .description = - "Forces all layouts up to --tmax to be computed, " - "regardless of the layout quality. If the flag --determine-best-tmax is not set, this flag is " - "ignored and has no effect.", - .advanced = true}); - - parser.add_flag( - config.output_verbose_statistics, - sharg::config{.short_id = '\0', - .long_id = "output-verbose-statistics", - .description = - "Enable verbose statistics to be " - "printed to std::cout. If the flag --determine-best-tmax is not set, this flag is ignored " - "and has no effect.", - .hidden = true}); - - parser.add_option( - config.sketch_directory, - sharg::config{ - .long_id = "output-sketches-to", - .description = - "If you supply a directory path with this option, the hyperloglog sketches of your input will be " - "stored in the respective path; one .hll file per input file.", - .default_message = "None", - .advanced = true}); - - parser.add_flag(config.debug, - sharg::config{.short_id = '\0', - .long_id = "debug", - .description = "Enables debug output in layout file.", - .hidden = true}); - - parser.add_section("References"); - parser.add_line("[1] Philippe Flajolet, Éric Fusy, Olivier Gandouet, Frédéric Meunier. HyperLogLog: the analysis " - "of a near-optimal cardinality estimation algorithm. AofA: Analysis of Algorithms, Jun 2007, Juan " - "les Pins, France. pp.137-156. 
hal-00406166v2, https://doi.org/10.46298/dmtcs.3545"); } diff --git a/include/chopper/sketch/check_filenames.hpp b/include/chopper/sketch/check_filenames.hpp index 8377d8db..c07e57c4 100644 --- a/include/chopper/sketch/check_filenames.hpp +++ b/include/chopper/sketch/check_filenames.hpp @@ -7,64 +7,15 @@ #pragma once -#include - -#include +#include +#include #include -#include namespace chopper::sketch { //!\brief Checks the `filenames` for consistent files, either precomputed or sequence files. -inline void check_filenames(std::vector const & filenames, configuration & config) -{ - assert(!filenames.empty()); - - auto case_insensitive_string_ends_with = [](std::string_view str, std::string_view suffix) - { - size_t const suffix_length{suffix.size()}; - size_t const str_length{str.size()}; - - if (suffix_length > str_length) - return false; // GCOVR_EXCL_LINE - - for (size_t j = 0, s_start = str_length - suffix_length; j < suffix_length; ++j) - if (std::tolower(str[s_start + j]) != std::tolower(suffix[j])) - return false; - - return true; - }; - - // If the first filename ends in .minimiser we expect all files to end in .minimiser - config.precomputed_files = case_insensitive_string_ends_with(filenames[0], ".minimiser"); - - for (auto const & filename : filenames) - { -#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY -# pragma GCC diagnostic push -# pragma GCC diagnostic ignored "-Wrestrict" -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY - if (!std::filesystem::exists(filename)) - throw std::invalid_argument{"File " + filename + " does not exist!"}; -#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY -# pragma GCC diagnostic pop -#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY - - if (config.precomputed_files && !case_insensitive_string_ends_with(filename, ".minimiser")) - { - throw std::invalid_argument{"You are providing precomputed files but the file " + filename - + " does not have the correct file extension (.minimiser)." 
- " Mixing non-/precomputed files is not allowed."}; - } - else if (!config.precomputed_files && case_insensitive_string_ends_with(filename, ".minimiser")) - { - throw std::invalid_argument{"You are providing sequence files but the file " + filename - + " was identified as a precomputed file (.minimiser)." - " Mixing non-/precomputed files is not allowed."}; - } - } -} +void check_filenames(std::vector const & filenames, configuration & config); } // namespace chopper::sketch diff --git a/include/chopper/sketch/output.hpp b/include/chopper/sketch/output.hpp index b403778d..556a0eb2 100644 --- a/include/chopper/sketch/output.hpp +++ b/include/chopper/sketch/output.hpp @@ -7,12 +7,12 @@ #pragma once -#include +#include +#include #include +#include #include -#include - #include #include @@ -20,27 +20,12 @@ namespace chopper::sketch { -inline void write_count_file_line(std::pair> const & cluster, - uint64_t const weight, - std::ofstream & fout) -{ - auto & [key, filepaths] = cluster; - - for (auto && arr : filepaths | seqan3::views::join_with(';')) - fout << arr; +void write_count_file_line(std::pair> const & cluster, + uint64_t const weight, + std::ofstream & fout); - fout << '\t' << weight << '\t' << key << '\n'; -} - -inline void write_sketch_file(std::string const & filename, - seqan::hibf::sketch::hyperloglog const & sketch, - configuration const & config) -{ - // For one file in the cluster, the file stem is used with the .hll ending - std::filesystem::path path = config.sketch_directory / std::filesystem::path(filename).stem(); - path += ".hll"; - std::ofstream hll_fout(path, std::ios::binary); - sketch.store(hll_fout); -} +void write_sketch_file(std::string const & filename, + seqan::hibf::sketch::hyperloglog const & sketch, + configuration const & config); } // namespace chopper::sketch diff --git a/include/chopper/sketch/read_data_file.hpp b/include/chopper/sketch/read_data_file.hpp index ef1e017c..a63f9f50 100644 --- 
a/include/chopper/sketch/read_data_file.hpp +++ b/include/chopper/sketch/read_data_file.hpp @@ -7,38 +7,14 @@ #pragma once -#include - -#include +#include +#include #include namespace chopper::sketch { -inline void read_data_file(configuration const & config, std::vector & filenames) -{ - std::ifstream fin{config.data_file.string()}; - - if (!fin.good() || !fin.is_open()) - throw std::runtime_error{"Could not open data file " + config.data_file.string() + " for reading."}; - - std::string line; - while (std::getline(fin, line)) - { - auto tab_pos = line.find('\t'); - - if (tab_pos == std::string::npos) - { - std::string const filename{line.begin(), line.end()}; - filenames.push_back(filename); - } - else - { - std::string const filename{line.begin(), line.begin() + tab_pos}; - filenames.push_back(filename); - } - } -} +void read_data_file(configuration const & config, std::vector & filenames); } // namespace chopper::sketch diff --git a/include/chopper/sketch/read_hll_files_into.hpp b/include/chopper/sketch/read_hll_files_into.hpp index f05ffca2..ec9825c3 100644 --- a/include/chopper/sketch/read_hll_files_into.hpp +++ b/include/chopper/sketch/read_hll_files_into.hpp @@ -7,9 +7,7 @@ #pragma once -#include #include -#include #include #include @@ -18,35 +16,8 @@ namespace chopper::sketch { -inline void read_hll_files_into(std::filesystem::path const & hll_dir, - std::vector const & target_filenames, - std::vector & target) -{ - assert(std::filesystem::exists(hll_dir) && !std::filesystem::is_empty(hll_dir)); // checked in chopper_layout - - target.reserve(target_filenames.size()); - - try - { - for (auto const & filename : target_filenames) - { - std::filesystem::path path = hll_dir / std::filesystem::path(filename).stem(); - path += ".hll"; - std::ifstream hll_fin(path, std::ios::binary); - - if (!hll_fin.good()) - throw std::runtime_error{"Could not open file " + path.string()}; - - // the sketch bits will be automatically read from the files - 
target.emplace_back().load(hll_fin); - } - } - catch (std::runtime_error const & err) - { - std::string const chopper_msg{"[CHOPPER LAYOUT ERROR] Something went wrong trying to read the HyperLogLog" - " sketches from files:\n"}; - throw std::runtime_error{chopper_msg + err.what()}; - } -} +void read_hll_files_into(std::filesystem::path const & hll_dir, + std::vector const & target_filenames, + std::vector & target); } // namespace chopper::sketch diff --git a/lib/robin-hood-hashing b/lib/robin-hood-hashing deleted file mode 160000 index 9145f963..00000000 --- a/lib/robin-hood-hashing +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 9145f963d80d6a02f0f96a47758050a89184a3ed diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a57acc4b..1913ce6a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -7,29 +7,26 @@ cmake_minimum_required (VERSION 3.18) -add_library (chopper_interface INTERFACE) -target_link_libraries (chopper_interface INTERFACE seqan3::seqan3) -target_link_libraries (chopper_interface INTERFACE sharg::sharg) -target_link_libraries (chopper_interface INTERFACE "seqan::hibf") -target_include_directories (chopper_interface INTERFACE ../include) -target_include_directories (chopper_interface INTERFACE ${CHOPPER_SUBMODULES_DIR}/robin-hood-hashing/src/include) -target_include_directories (chopper_interface INTERFACE ${CHOPPER_SUBMODULES_DIR}/simde/simde) -target_compile_options (chopper_interface INTERFACE "-pedantic" "-Wall" "-Wextra") - -add_library (chopper_layout_lib STATIC chopper_layout.cpp) -target_link_libraries (chopper_layout_lib "chopper_interface") +if (NOT TARGET chopper_shared) + add_library (chopper_shared STATIC configuration.cpp input_functor.cpp) + + target_link_libraries (chopper_shared PUBLIC seqan3::seqan3) + target_link_libraries (chopper_shared PUBLIC sharg::sharg) + target_link_libraries (chopper_shared PUBLIC seqan::hibf) + target_include_directories (chopper_shared PUBLIC ../include) + target_include_directories 
(chopper_shared PUBLIC ${CHOPPER_SUBMODULES_DIR}/simde/simde) + target_compile_options (chopper_shared PUBLIC "-pedantic" "-Wall" "-Wextra") +endif () add_library (chopper_lib INTERFACE) -target_link_libraries (chopper_lib INTERFACE "chopper_interface" "chopper_layout_lib") - -add_executable (chopper chopper.cpp) -target_link_libraries (chopper "chopper_lib") +target_link_libraries (chopper_lib INTERFACE chopper_layout chopper_sketch) -add_executable (measure_hyperloglog EXCLUDE_FROM_ALL measure_hyperloglog.cpp) -target_link_libraries (measure_hyperloglog "chopper_interface") -target_compile_options (measure_hyperloglog PRIVATE "-Werror") +add_executable (chopper chopper.cpp set_up_parser.cpp) +target_link_libraries (chopper PUBLIC chopper_lib) -add_subdirectory (display_layout) +add_subdirectory (layout) +add_subdirectory (sketch) +add_subdirectory (util) if (CHOPPER_INSTALL) install (TARGETS chopper RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}") diff --git a/src/chopper.cpp b/src/chopper.cpp index 34eb5ace..c8da4697 100644 --- a/src/chopper.cpp +++ b/src/chopper.cpp @@ -5,9 +5,14 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- -#include +#include +#include +#include +#include +#include +#include -#include +#include #include #include diff --git a/src/configuration.cpp b/src/configuration.cpp new file mode 100644 index 00000000..dde603ca --- /dev/null +++ b/src/configuration.cpp @@ -0,0 +1,64 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: 
https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include + +#include + +#include +#include + +namespace chopper +{ + +void configuration::read_from(std::istream & stream) +{ + std::string line; + std::stringstream config_str; + + while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_config_start) + ; + + assert(line == chopper::prefix::meta_chopper_config_start); + + while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_config_end) + { + assert(line.size() >= 2); + assert(std::string_view{line}.substr(0, 1) == seqan::hibf::prefix::meta_header); + config_str << line.substr(1); // remove seqan::hibf::prefix::meta_header + } + + assert(line == chopper::prefix::meta_chopper_config_end); + + cereal::JSONInputArchive iarchive(config_str); + iarchive(*this); + + hibf_config.read_from(stream); +} + +void configuration::write_to(std::ostream & stream) const +{ + // write json file to temprorary string stream with cereal + std::stringstream config_stream{}; + cereal::JSONOutputArchive output(config_stream); // stream to cout + output(cereal::make_nvp("chopper_config", *this)); + + // write config + stream << chopper::prefix::meta_chopper_config_start << '\n'; + std::string line; + while (std::getline(config_stream, line, '\n')) + stream << seqan::hibf::prefix::meta_header << line << '\n'; + stream << seqan::hibf::prefix::meta_header << "}\n" // last closing bracket isn't written by loop above + << chopper::prefix::meta_chopper_config_end << '\n'; + + hibf_config.write_to(stream); +} + +} // namespace chopper diff --git a/src/input_functor.cpp b/src/input_functor.cpp new file mode 100644 index 00000000..c5fbce6e --- /dev/null +++ b/src/input_functor.cpp @@ -0,0 +1,57 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 
2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace chopper +{ + +void input_functor::operator()(size_t const num, seqan::hibf::insert_iterator it) +{ + assert(filenames.size() > num); + if (input_are_precomputed_files) + { + uint64_t hash{}; + char * const hash_data{reinterpret_cast(&hash)}; + std::streamsize const hash_bytes{sizeof(hash)}; + + std::ifstream infile{filenames[num], std::ios::binary}; + + while (infile.read(hash_data, hash_bytes)) + it = hash; + } + else + { + sequence_file_type fin{filenames[num]}; + + seqan3::shape shape = seqan3::ungapped{kmer_size}; + auto minimizer_view = seqan3::views::minimiser_hash(shape, + seqan3::window_size{kmer_size}, + seqan3::seed{adjust_seed(shape.count())}); + + for (auto && [seq] : fin) + { + for (auto hash_value : seq | minimizer_view) + it = hash_value; + } + } +} + +} // namespace chopper diff --git a/src/layout/CMakeLists.txt b/src/layout/CMakeLists.txt new file mode 100644 index 00000000..3faee796 --- /dev/null +++ b/src/layout/CMakeLists.txt @@ -0,0 +1,9 @@ +cmake_minimum_required (VERSION 3.18) + +if (NOT TARGET chopper_layout) + add_library (chopper_layout STATIC determine_best_number_of_technical_bins.cpp execute.cpp hibf_statistics.cpp + ibf_query_cost.cpp input.cpp output.cpp + ) + + target_link_libraries (chopper_layout PUBLIC chopper_shared) +endif () diff --git a/src/chopper_layout.cpp b/src/layout/determine_best_number_of_technical_bins.cpp similarity index 54% rename from 
src/chopper_layout.cpp rename to src/layout/determine_best_number_of_technical_bins.cpp index 58de3d62..dbf80a8c 100644 --- a/src/chopper_layout.cpp +++ b/src/layout/determine_best_number_of_technical_bins.cpp @@ -5,21 +5,25 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- -#include +#include +#include +#include +#include +#include #include - -#include -#include +#include +#include +#include #include +#include #include -#include -#include -#include +#include -#include #include +#include #include +#include namespace chopper::layout { @@ -98,72 +102,4 @@ determine_best_number_of_technical_bins(chopper::configuration & config) return {best_layout, sketches}; } -int execute(chopper::configuration & config, std::vector const & filenames) -{ - assert(config.hibf_config.number_of_user_bins > 0); - - if (config.hibf_config.disable_estimate_union) - config.hibf_config.disable_rearrangement = true; - - if (config.hibf_config.tmax == 0) // no tmax was set by the user on the command line - { - // Set default as sqrt(#samples). Experiments showed that this is a reasonable default. - if (size_t number_samples = config.hibf_config.number_of_user_bins; - number_samples >= 1ULL << 32) // sqrt is bigger than uint16_t - throw std::invalid_argument{"Too many samples. Please set a tmax (see help via `-hh`)."}; // GCOVR_EXCL_LINE - else - config.hibf_config.tmax = - chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(number_samples)))); - } - else if (config.hibf_config.tmax % 64 != 0) - { - config.hibf_config.tmax = chopper::next_multiple_of_64(config.hibf_config.tmax); - std::cerr << "[CHOPPER LAYOUT WARNING]: Your requested number of technical bins was not a multiple of 64. 
" - << "Due to the architecture of the HIBF, it will use up space equal to the next multiple of 64 " - << "anyway, so we increased your number of technical bins to " << config.hibf_config.tmax << ".\n"; - } - - seqan::hibf::layout::layout hibf_layout; - std::vector sketches; - - if (config.determine_best_tmax) - { - std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config); - } - else - { - std::vector kmer_counts; - - seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches); - hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, kmer_counts, sketches); - - if (config.output_verbose_statistics) - { - size_t dummy{}; - chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; - global_stats.hibf_layout = hibf_layout; - global_stats.print_header_to(std::cout); - global_stats.print_summary_to(dummy, std::cout); - } - } - - if (!config.disable_sketch_output) - { - if (!std::filesystem::exists(config.sketch_directory)) - std::filesystem::create_directory(config.sketch_directory); - - assert(filenames.size() == sketches.size()); - for (size_t i = 0; i < filenames.size(); ++i) - sketch::write_sketch_file(filenames[i], sketches[i], config); - } - - // brief Write the output to the layout file. 
- std::ofstream fout{config.output_filename}; - chopper::layout::write_user_bins_to(filenames, fout); - config.write_to(fout); - hibf_layout.write_to(fout); - - return 0; -} - } // namespace chopper::layout diff --git a/src/layout/execute.cpp b/src/layout/execute.cpp new file mode 100644 index 00000000..48110377 --- /dev/null +++ b/src/layout/execute.cpp @@ -0,0 +1,104 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace chopper::layout +{ + +int execute(chopper::configuration & config, std::vector const & filenames) +{ + assert(config.hibf_config.number_of_user_bins > 0); + + if (config.hibf_config.disable_estimate_union) + config.hibf_config.disable_rearrangement = true; + + if (config.hibf_config.tmax == 0) // no tmax was set by the user on the command line + { + // Set default as sqrt(#samples). Experiments showed that this is a reasonable default. + if (size_t number_samples = config.hibf_config.number_of_user_bins; + number_samples >= 1ULL << 32) // sqrt is bigger than uint16_t + throw std::invalid_argument{"Too many samples. 
Please set a tmax (see help via `-hh`)."}; // GCOVR_EXCL_LINE + else + config.hibf_config.tmax = + chopper::next_multiple_of_64(static_cast(std::ceil(std::sqrt(number_samples)))); + } + else if (config.hibf_config.tmax % 64 != 0) + { + config.hibf_config.tmax = chopper::next_multiple_of_64(config.hibf_config.tmax); + std::cerr << "[CHOPPER LAYOUT WARNING]: Your requested number of technical bins was not a multiple of 64. " + << "Due to the architecture of the HIBF, it will use up space equal to the next multiple of 64 " + << "anyway, so we increased your number of technical bins to " << config.hibf_config.tmax << ".\n"; + } + + seqan::hibf::layout::layout hibf_layout; + std::vector sketches; + + if (config.determine_best_tmax) + { + std::tie(hibf_layout, sketches) = determine_best_number_of_technical_bins(config); + } + else + { + std::vector kmer_counts; + + seqan::hibf::sketch::compute_sketches(config.hibf_config, kmer_counts, sketches); + hibf_layout = seqan::hibf::layout::compute_layout(config.hibf_config, kmer_counts, sketches); + + if (config.output_verbose_statistics) + { + size_t dummy{}; + chopper::layout::hibf_statistics global_stats{config, sketches, kmer_counts}; + global_stats.hibf_layout = hibf_layout; + global_stats.print_header_to(std::cout); + global_stats.print_summary_to(dummy, std::cout); + } + } + + if (!config.disable_sketch_output) + { + if (!std::filesystem::exists(config.sketch_directory)) + std::filesystem::create_directory(config.sketch_directory); + + assert(filenames.size() == sketches.size()); + for (size_t i = 0; i < filenames.size(); ++i) + sketch::write_sketch_file(filenames[i], sketches[i], config); + } + + // brief Write the output to the layout file. 
+ std::ofstream fout{config.output_filename}; + chopper::layout::write_user_bins_to(filenames, fout); + config.write_to(fout); + hibf_layout.write_to(fout); + + return 0; +} + +} // namespace chopper::layout diff --git a/src/layout/hibf_statistics.cpp b/src/layout/hibf_statistics.cpp new file mode 100644 index 00000000..8b1655a3 --- /dev/null +++ b/src/layout/hibf_statistics.cpp @@ -0,0 +1,587 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +// clang-format off +#include +// clang-format on + +#include +#include +#include +#include +#include +#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wrestrict" +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +#include +#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +# pragma GCC diagnostic pop +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace chopper::layout +{ + +hibf_statistics::hibf_statistics(configuration const & config_, + std::vector const & sketches_, + std::vector const & kmer_counts) : + config{config_}, + fp_correction{ + seqan::hibf::layout::compute_fpr_correction({.fpr = config_.hibf_config.maximum_false_positive_rate, + .hash_count = config_.hibf_config.number_of_hash_functions, + .t_max = config_.hibf_config.tmax})}, + sketches{sketches_}, + counts{kmer_counts}, + total_kmer_count{std::accumulate(kmer_counts.begin(), 
kmer_counts.end(), size_t{})} +{} + +void hibf_statistics::finalize() +{ + collect_bins(); + + compute_cardinalities(top_level_ibf); + + compute_total_query_cost(top_level_ibf); + + gather_statistics(top_level_ibf, 0); + + expected_HIBF_query_cost = total_query_cost / total_kmer_count; +} + +//!\brief Prints a column names of the summary to the command line. +void hibf_statistics::print_header_to(std::ostream & stream, bool const verbose) +{ + // print column names explanation in header + stream << "## ### Notation ###\n" + << "## X-IBF = An IBF with X number of bins.\n" + << "## X-HIBF = An HIBF with tmax = X, e.g a maximum of X technical bins on each level.\n"; + + stream << "## ### Column Description ###\n" + "## tmax : The maximum number of technical bin on each level\n" + "## c_tmax : The technical extra cost of querying an tmax-IBF, compared to 64-IBF\n" + "## l_tmax : The estimated query cost for an tmax-HIBF, compared to an 64-HIBF\n" + "## m_tmax : The estimated memory consumption for an tmax-HIBF, compared to an 64-HIBF\n" + "## (l*m)_tmax : Computed by l_tmax * m_tmax\n" + "## size : The expected total size of an tmax-HIBF\n" + << ((verbose) ? 
"## uncorr_size : The expected size of an tmax-HIBF without FPR correction\n" : ""); + + // print column names + stream << "# tmax" << '\t' << "c_tmax" << '\t' << "l_tmax" << '\t' << "m_tmax" << '\t' << "(l*m)_tmax" << '\t' + << "size"; + + if (verbose) // uncorrected size and add level statistics + { + stream << '\t' << "uncorr_size" << '\t' << "level" << '\t' << "num_ibfs" << '\t' << "level_size" << '\t' + << "level_size_no_corr" << '\t' << "total_num_tbs" << '\t' << "avg_num_tbs" << '\t' + << "split_tb_percentage" << '\t' << "max_split_tb" << '\t' << "avg_split_tb" << '\t' << "max_factor" + << '\t' << "avg_factor"; + } + + stream << '\n'; +} + +void hibf_statistics::print_summary_to(size_t & t_max_64_memory, std::ostream & stream, bool const verbose) +{ + if (summaries.empty()) + finalize(); + + if (t_max_64_memory == 0) + t_max_64_memory = total_hibf_size_in_byte(); + + double const relative_memory_size = total_hibf_size_in_byte() / static_cast(t_max_64_memory); + double const query_time_memory_usage_prod = expected_HIBF_query_cost * relative_memory_size; + + stream << std::fixed << std::setprecision(2); + + std::string level_str, num_ibfs_str, level_size_str, level_size_no_corr_str, total_num_tbs_str, avg_num_tbs_str, + split_tb_percentage_str, max_split_tb_str, avg_split_tb_str, max_factor_str, avg_factor_str; + + size_t total_size{}; + size_t total_size_no_corr{}; + + // go through each level and collect and output the statistics + auto to_string_with_precision = [](auto num) + { + std::stringstream ss; + ss << std::fixed << std::setprecision(2) << num; + return ss.str(); + }; + + for (auto const & [level, s] : summaries) + { + size_t const level_size = std::reduce(s.ibf_mem_size.begin(), s.ibf_mem_size.end()); + size_t const level_size_no_corr = std::reduce(s.ibf_mem_size_no_corr.begin(), s.ibf_mem_size_no_corr.end()); + + total_size += level_size; + total_size_no_corr += level_size_no_corr; + + size_t const total_num_tbs = std::reduce(s.num_tbs.begin(), 
s.num_tbs.end()); + + size_t const total_num_split_tbs = std::reduce(s.num_split_tbs.begin(), s.num_split_tbs.end()); + double const split_tb_percentage = 100.0 * static_cast(total_num_split_tbs) / total_num_tbs; + + size_t const max_split_bin_span = *std::max_element(s.max_split_tb_span.begin(), s.max_split_tb_span.end()); + +#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wrestrict" +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY + + level_str += ":" + to_string_with_precision(level); + num_ibfs_str += ":" + to_string_with_precision(s.num_ibfs); + level_size_str += ":" + to_formatted_BF_size(level_size); + level_size_no_corr_str += ":" + to_formatted_BF_size(level_size_no_corr); + total_num_tbs_str += ":" + to_string_with_precision(total_num_tbs); + avg_num_tbs_str += ":" + to_string_with_precision(total_num_tbs / s.num_ibfs); + split_tb_percentage_str += ":" + to_string_with_precision(split_tb_percentage); + + // if there are no split bins on this level, the following statistics don't make sense + if (max_split_bin_span != 0) + { + size_t const total_num_split_ubs = std::reduce(s.num_split_ubs.begin(), s.num_split_ubs.end()); + double const avg_split_bin = + static_cast(total_num_split_tbs) / static_cast(total_num_split_ubs); + size_t const total_split_tb_kmers = std::reduce(s.split_tb_kmers.begin(), s.split_tb_kmers.end()); + double const avg_factor = + static_cast(std::reduce(s.split_tb_corr_kmers.begin(), s.split_tb_corr_kmers.end())) + / static_cast(total_split_tb_kmers); + + max_split_tb_str += ":" + to_string_with_precision(max_split_bin_span); + avg_split_tb_str += ":" + to_string_with_precision(avg_split_bin); + max_factor_str += ":" + to_string_with_precision((fp_correction)[max_split_bin_span]); + avg_factor_str += ":" + to_string_with_precision(avg_factor); + } + else + { + max_split_tb_str += ":-"; + avg_split_tb_str += ":-"; + max_factor_str += ":-"; + avg_factor_str += ":-"; + } +#if 
CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +# pragma GCC diagnostic pop +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY + } + + stream << std::fixed << std::setprecision(2); + + stream /* tmax */ << config.hibf_config.tmax + << '\t' + /* c_tmax */ + << chopper::layout::ibf_query_cost::interpolated( + config.hibf_config.tmax, + config.hibf_config.maximum_false_positive_rate) + << '\t' + /* l_tmax */ + << expected_HIBF_query_cost + << '\t' /*relative to a 64 bin IBF*/ + /* m_tmax */ + << relative_memory_size + << '\t' /*relative to the 64 T_Max HIBF*/ + /* (l*m)tmax */ + << query_time_memory_usage_prod + << '\t' + /* corr. size */ + << to_formatted_BF_size(total_size) << ((verbose) ? '\t' : '\n'); + + if (verbose) + { + // uncorrected FPR + stream /*uncorr. size */ << to_formatted_BF_size(total_size_no_corr) << '\t'; + + // per level statistics: + stream /* level */ << level_str + << '\t' + /* num_ibfs */ + << num_ibfs_str + << '\t' + /* level_size */ + << level_size_str + << '\t' + /* level_size_no_corr */ + << level_size_no_corr_str + << '\t' + /* total_num_tbs */ + << total_num_tbs_str + << '\t' + /* avg_num_tbs */ + << avg_num_tbs_str + << '\t' + /* split_tb_percentage */ + << split_tb_percentage_str + << '\t' + /* max_split_tb */ + << max_split_tb_str + << '\t' + /* avg_split_tb */ + << avg_split_tb_str + << '\t' + /* max_factor */ + << max_factor_str + << '\t' + /* avg_factor */ + << avg_factor_str << '\n'; + } +} + +//!\brief Return the total corrected size of the HIBF in bytes +size_t hibf_statistics::total_hibf_size_in_byte() +{ + if (summaries.empty()) + finalize(); + + size_t total_size{}; + + // go through each level and collect the memory sizes + for (auto const & [level, summary] : summaries) + { + (void)level; + + total_size += std::reduce(summary.ibf_mem_size.begin(), summary.ibf_mem_size.end()); + } + + return compute_bin_size(total_size) / 8; +} + +//!\brief Round bytes to the appropriate unit and convert to string with unit. 
+[[nodiscard]] std::string hibf_statistics::byte_size_to_formatted_str(size_t const bytes) +{ + size_t iterations{}; + size_t integer{bytes}; + + while (integer >> 10u && iterations < 6u) + { + integer >>= 10u; + ++iterations; + } + + // While this is a bit more involved, we can avoid using floating point numbers. + auto first_decimal_position = [&]() + { + assert(iterations > 0u); + size_t decimal{bytes}; + decimal -= integer << (iterations * 10u); // Substract bytes represented by integer, e.g. -5GiB + decimal >>= (iterations - 1u) * 10u; // Shift to next smallest unit, e.g. 800MiB + decimal = decimal * 1000u / 1024u; // Account for using decimal system, i.e. 800MiB != 0.8GiB + size_t const diff{decimal - (decimal / 100u) * 100u}; // We want to round up to 1 decimal position + uint32_t const round_up{diff >= 50u}; + decimal += round_up * 100u - diff; + decimal /= 100u; + return decimal; + }; + + auto formatted_string = [&]() + { + static constexpr int8_t int_to_char_offset{'0'}; // int 0 as char: char{0 + 48} = '0' + size_t const decimal = iterations ? first_decimal_position() : 0u; + assert(decimal <= 10u); + + if (!iterations) // No decimals for Bytes + return std::to_string(integer); + else if (decimal < 10u) // No need to round integer part + return std::to_string(integer) + '.' + static_cast(decimal + int_to_char_offset); + else // Round integer part, e.g., 5.99 MiB should report 6.0 MiB + { + ++integer; + // Check whether rounding results in a change of unit, e.g. 
1023.99MiB to 1.0GiB + if (integer >> 10u) + { + ++iterations; + integer >>= 10u; + } + return std::to_string(integer) + ".0"; + } + }; + + std::string result{formatted_string()}; + switch (iterations) + { + case 0: + result += "Bytes"; + break; + case 1: + result += "KiB"; + break; + case 2: + result += "MiB"; + break; + case 3: + result += "GiB"; + break; + case 4: + result += "TiB"; + break; + case 5: + result += "PiB"; + break; + default: + result += "EiB"; + break; + } + + return result; +} + +size_t hibf_statistics::compute_bin_size(size_t const number_of_kmers_to_be_stored) const +{ + return std::ceil(-static_cast(number_of_kmers_to_be_stored * config.hibf_config.number_of_hash_functions) + / std::log(1 + - std::exp(std::log(config.hibf_config.maximum_false_positive_rate) + / config.hibf_config.number_of_hash_functions))); +} + +std::string hibf_statistics::to_formatted_BF_size(size_t const number_of_kmers_to_be_stored) const +{ + size_t const size_in_bytes = compute_bin_size(number_of_kmers_to_be_stored) / 8; + return byte_size_to_formatted_str(size_in_bytes); +} + +void hibf_statistics::collect_bins() +{ + std::vector ibfs(hibf_layout.max_bins.size() + 1); // 0 = top_level + robin_hood::unordered_map, size_t> id_to_pos{}; + + // fill id_to_pos map + id_to_pos[std::vector{}] = 0; + for (size_t i = 0; i < hibf_layout.max_bins.size(); ++i) + id_to_pos[hibf_layout.max_bins[i].previous_TB_indices] = i + 1; + + for (auto const & user_bin_info : hibf_layout.user_bins) + { + std::vector prev{}; + + // add user bin index to previous merged bins + for (size_t i = 0; i < user_bin_info.previous_TB_indices.size(); ++i) + { + auto & ibf = ibfs[id_to_pos.at(prev)]; + auto const target_tb_index = user_bin_info.previous_TB_indices[i]; + + bool found_merged_bin{false}; + for (auto & previous_bins_to_check : ibf.bins) + { + if (previous_bins_to_check.tb_index == target_tb_index) + { + found_merged_bin = true; + 
previous_bins_to_check.user_bin_indices.push_back(user_bin_info.idx); + ++previous_bins_to_check.num_contained_ubs; + } + } + + if (!found_merged_bin) + { + ibf.bins.emplace_back(hibf_statistics::bin_kind::merged, 1, std::vector{user_bin_info.idx}); + ibf.bins.back().tb_index = target_tb_index; + auto next = prev; + next.push_back(target_tb_index); + ibf.bins.back().child_level_idx = id_to_pos.at(next); + } + prev.push_back(target_tb_index); + } + + // emplace a split bin at last since every user bin is on its lowest level single or split + auto & ibf = ibfs[id_to_pos.at(prev)]; + ibf.bins.emplace_back(hibf_statistics::bin_kind::split, + user_bin_info.number_of_technical_bins, + std::vector{user_bin_info.idx}); + ibf.bins.back().tb_index = user_bin_info.storage_TB_id; + } + + for (auto & ibf : ibfs) + for (auto & bin : ibf.bins) + if (bin.kind == hibf_statistics::bin_kind::merged) + bin.child_level = ibfs[bin.child_level_idx]; + + top_level_ibf = std::move(ibfs[0]); +} + +void hibf_statistics::compute_cardinalities(level & curr_level) +{ + for (bin & current_bin : curr_level.bins) + { + if (current_bin.kind == bin_kind::merged) + { + if (config.hibf_config.disable_estimate_union) + { + size_t sum{}; + for (size_t i = 0; i < current_bin.user_bin_indices.size(); ++i) + sum += counts[current_bin.user_bin_indices[i]]; // TODO should be kmer_counts + current_bin.cardinality = sum; + } + else + { + assert(!current_bin.user_bin_indices.empty()); + seqan::hibf::sketch::hyperloglog hll = sketches[current_bin.user_bin_indices[0]]; + + for (size_t i = 1; i < current_bin.user_bin_indices.size(); ++i) + hll.merge(sketches[current_bin.user_bin_indices[i]]); + + current_bin.cardinality = hll.estimate(); + } + + compute_cardinalities(current_bin.child_level); + } + else if (current_bin.kind == bin_kind::split) // bin_kind::split + { + assert(current_bin.user_bin_indices.size() == 1); + current_bin.cardinality = counts[current_bin.user_bin_indices[0]]; + } + } +} + +void 
hibf_statistics::compute_total_query_cost(level & curr_level) +{ + // Compute number of technical bins in current level (<= tmax) + size_t number_of_tbs{0}; + size_t level_kmer_count{0}; + size_t index{0}; + std::vector merged_bin_indices{}; + std::vector merged_bin_sketches{}; + + for (bin const & current_bin : curr_level.bins) + { + if (current_bin.kind == bin_kind::merged) + { + ++number_of_tbs; + merged_bin_indices.push_back(index); + + if (!config.hibf_config.disable_estimate_union) + { + // compute merged_bin_sketch + assert(!current_bin.user_bin_indices.empty()); + seqan::hibf::sketch::hyperloglog hll = sketches[current_bin.user_bin_indices[0]]; + + for (size_t i = 1; i < current_bin.user_bin_indices.size(); ++i) + hll.merge(sketches[current_bin.user_bin_indices[i]]); + + merged_bin_sketches.push_back(std::move(hll)); + } + } + else if (current_bin.kind == bin_kind::split) // bin_kind::split + { + number_of_tbs += current_bin.num_spanning_tbs; + level_kmer_count += current_bin.cardinality; + } + ++index; + } + assert(number_of_tbs <= config.hibf_config.tmax); + + // Add cost of querying the current IBF + // (how costly is querying number_of_tbs (e.g. 128 tbs) compared to 64 tbs given the current FPR) + curr_level.current_query_cost += + ibf_query_cost::interpolated(number_of_tbs, config.hibf_config.maximum_false_positive_rate); + + // Add costs of querying the HIBF for each kmer in this level. + total_query_cost += curr_level.current_query_cost * level_kmer_count; + + // update query cost of all merged bins + for (size_t i = 0; i < merged_bin_indices.size(); ++i) + { + auto & current_bin = curr_level.bins[merged_bin_indices[i]]; + + // Pass on cost of querying the current level + current_bin.child_level.current_query_cost = curr_level.current_query_cost; + + // If merged bins share kmers, we need to penalize this + // because querying a kmer will result in multi level look-ups. 
+ if (!config.hibf_config.disable_estimate_union) + { + double const current_estimate = merged_bin_sketches[i].estimate(); + + for (size_t j = i + 1; j < merged_bin_indices.size(); ++j) + { + seqan::hibf::sketch::hyperloglog tmp = + merged_bin_sketches[i]; // copy needed, s.t. current is not modified + double union_estimate = tmp.merge_and_estimate(merged_bin_sketches[j]); + // Jaccard distance estimate + double distance = 2.0 - (current_estimate + merged_bin_sketches[j].estimate()) / union_estimate; + // Since the sizes are estimates, the distance might be slighlty above 1.0 or below 0.0 + // but we need to avoid nagetive numbers + distance = std::min(std::max(distance, 0.0), 1.0); + + current_bin.child_level.current_query_cost += (1.0 - distance); + } + } + } + + // call function recursively for each merged bin + for (size_t i : merged_bin_indices) + compute_total_query_cost(curr_level.bins[i].child_level); +} + +void hibf_statistics::gather_statistics(level const & curr_level, size_t const level_summary_index) +{ + level_summary & summary = summaries[level_summary_index]; + summary.num_ibfs += 1; + + size_t max_cardinality{}, max_cardinality_no_corr{}, num_tbs{}, num_ubs{}, num_split_tbs{}, num_merged_tbs{}, + num_split_ubs{}, num_merged_ubs{}, max_split_tb_span{}, split_tb_kmers{}, max_ubs_in_merged{}, + split_tb_corr_kmers{}; + + for (bin const & current_bin : curr_level.bins) + { + size_t const cardinality_per_split_bin = + (current_bin.cardinality + current_bin.num_spanning_tbs - 1) / current_bin.num_spanning_tbs; // round up + size_t const corrected_cardinality = + std::ceil(cardinality_per_split_bin * (fp_correction)[current_bin.num_spanning_tbs]); + max_cardinality = std::max(max_cardinality, corrected_cardinality); + max_cardinality_no_corr = std::max(max_cardinality_no_corr, cardinality_per_split_bin); + + num_tbs += current_bin.num_spanning_tbs; + num_ubs += current_bin.num_contained_ubs; + + if (current_bin.kind == bin_kind::split) + { + num_split_tbs 
+= current_bin.num_spanning_tbs; + num_split_ubs += 1; + split_tb_corr_kmers += corrected_cardinality * current_bin.num_spanning_tbs; + split_tb_kmers += cardinality_per_split_bin * current_bin.num_spanning_tbs; + max_split_tb_span = std::max(max_split_tb_span, current_bin.num_spanning_tbs); + } + else + { + num_merged_tbs += 1; + num_merged_ubs += current_bin.num_contained_ubs; + max_ubs_in_merged = std::max(max_ubs_in_merged, current_bin.num_contained_ubs); + + gather_statistics(current_bin.child_level, level_summary_index + 1); + } + } + + summary.num_tbs.push_back(num_tbs); + summary.num_ubs.push_back(num_ubs); + + summary.num_split_tbs.push_back(num_split_tbs); + summary.num_merged_tbs.push_back(num_merged_tbs); + + summary.num_split_ubs.push_back(num_split_ubs); + summary.num_merged_ubs.push_back(num_merged_ubs); + + summary.max_split_tb_span.push_back(max_split_tb_span); + summary.split_tb_corr_kmers.push_back(split_tb_corr_kmers); + summary.split_tb_kmers.push_back(split_tb_kmers); + + summary.max_ubs_in_merged.push_back(max_ubs_in_merged); + + summary.ibf_mem_size.push_back(max_cardinality * num_tbs); + summary.ibf_mem_size_no_corr.push_back(max_cardinality_no_corr * num_tbs); +} + +} // namespace chopper::layout diff --git a/src/layout/ibf_query_cost.cpp b/src/layout/ibf_query_cost.cpp new file mode 100644 index 00000000..40f9de93 --- /dev/null +++ b/src/layout/ibf_query_cost.cpp @@ -0,0 +1,84 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include 
+#include +#include +#include +#include +#include +#include + +#include + +namespace chopper::layout +{ + +double ibf_query_cost::exact(size_t const t_max, double const fpr) +{ + auto it = find_closest_fpr(fpr); + + if (contains(t_max)) + return it->second[position(t_max)]; + else + throw std::invalid_argument("No exact data available for this t_max."); +} + +double ibf_query_cost::interpolated(size_t const t_max, double const fpr) +{ + auto it = find_closest_fpr(fpr); + + if (t_max <= 64u) + { + return it->second[0]; + } + else if (t_max > maximum_t_max) + { + throw std::invalid_argument("No data available for a t_max this large."); + } + else if (contains(t_max)) + { + return it->second[position(t_max)]; + } + else + { + size_t const upper_bound{std::bit_ceil(t_max)}; + size_t const lower_bound{upper_bound >> 1}; + double const upper_value{it->second[position(upper_bound)]}; + double const lower_value{it->second[position(lower_bound)]}; + + double const interpolated_value{lower_value + + (upper_value - lower_value) * (t_max - lower_bound) / lower_bound}; + assert(interpolated_value <= upper_value); + return interpolated_value; + } +} + +std::map>::const_iterator ibf_query_cost::find_closest_fpr(double const fpr) +{ + if (auto it = cost_factors.find(fpr); it != cost_factors.end()) // fpr is found exaclty in map + return it; + + // otherwise search for the closest one in the map + auto lower_it = cost_factors.lower_bound(fpr); + auto upper_it = cost_factors.upper_bound(fpr); + + assert(lower_it != cost_factors.end() || upper_it != cost_factors.end()); + + if (lower_it == cost_factors.end()) + return upper_it; + + if (upper_it == cost_factors.end()) + return lower_it; + + if (std::abs(lower_it->first - fpr) < std::abs(upper_it->first - fpr)) + return lower_it; + else + return upper_it; +} + +} // namespace chopper::layout diff --git a/src/layout/input.cpp b/src/layout/input.cpp new file mode 100644 index 00000000..2b1a7459 --- /dev/null +++ b/src/layout/input.cpp @@ 
-0,0 +1,80 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +namespace chopper::layout +{ + +std::vector> read_filenames_from(std::istream & stream) +{ + std::vector> filenames{}; + std::string line; + + while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_user_bins_start) + ; + + assert(line == chopper::prefix::meta_chopper_user_bins_start); + +#ifndef NDEBUG + size_t counter{}; +#endif + while (std::getline(stream, line) && line != chopper::prefix::meta_chopper_user_bins_end) + { + assert(line.size() >= 2); + assert(std::string_view{line}.substr(0, 1) == seqan::hibf::prefix::meta_header); + + // @0 file1.fa file2.fa + auto const bin_idx_pos = line.find(' '); + assert(bin_idx_pos != std::string::npos); + +#ifndef NDEBUG + size_t bin_idx{}; + std::from_chars(line.data() + 1, line.data() + bin_idx_pos, bin_idx); + assert(bin_idx == counter++); +#endif + + filenames.emplace_back(); + std::string_view const filename_str{line.begin() + bin_idx_pos + 1, line.end()}; + for (auto const && filename : std::views::split(filename_str, ' ')) + { + auto common_view = std::views::common(filename); + filenames.back().emplace_back(common_view.begin(), common_view.end()); + } + } + + assert(line == chopper::prefix::meta_chopper_user_bins_end); + + return filenames; +} + +std::tuple>, configuration, seqan::hibf::layout::layout> 
+read_layout_file(std::istream & stream) +{ + std::vector> filenames = chopper::layout::read_filenames_from(stream); + chopper::configuration chopper_config; + chopper_config.read_from(stream); + seqan::hibf::layout::layout hibf_layout{}; + hibf_layout.read_from(stream); + return std::make_tuple(std::move(filenames), std::move(chopper_config), std::move(hibf_layout)); +} + +} // namespace chopper::layout diff --git a/src/layout/output.cpp b/src/layout/output.cpp new file mode 100644 index 00000000..f0436bee --- /dev/null +++ b/src/layout/output.cpp @@ -0,0 +1,31 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include +#include + +#include + +namespace chopper::layout +{ + +void write_user_bins_to(std::vector const & filenames, std::ostream & stream) +{ + stream << chopper::prefix::meta_chopper_user_bins_start << '\n'; + size_t counter{}; + for (auto const & filename : filenames) + stream << seqan::hibf::prefix::meta_header << counter++ << ' ' << filename << '\n'; + stream << chopper::prefix::meta_chopper_user_bins_end << '\n'; +} + +} // namespace chopper::layout diff --git a/src/set_up_parser.cpp b/src/set_up_parser.cpp new file mode 100644 index 00000000..da9147dd --- /dev/null +++ b/src/set_up_parser.cpp @@ -0,0 +1,237 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 
2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include + +namespace chopper +{ + +void set_up_parser(sharg::parser & parser, configuration & config) +{ + parser.info.version = "1.0.0"; + parser.info.author = "Svenja Mehringer"; + parser.info.email = "svenja.mehringer@fu-berlin.de"; + parser.info.short_description = "Compute an HIBF layout"; + + parser.info.description.emplace_back("Computes an HIBF layout that tries to minimize the disk space consumption of " + "the resulting index. The space is estimated using a k-mer count per user " + "bin which represents the potential denisity in a technical bin in an " + "interleaved Bloom filter. You can pass the resulting layout to raptor " + "(https://github.com/seqan/raptor) to build the index and " + "conduct queries."); + + parser.info.synopsis.emplace_back( + " --input [--output ] [--threads ] [--kmer ] [--fpr ] [--hash ] " + "[--disable-estimate-union] [--disable-rearrangement]"); + + parser.add_subsection("Main options:"); + // ----------------------------------------------------------------------------------------------------------------- + parser.add_option( + config.data_file, + sharg::config{ + .short_id = '\0', + .long_id = "input", + .description = + "The input must be a file containing paths to sequence data you wish to estimate; one filepath " + "per line. If your file contains auxiliary information (e.g. 
species IDs), your file must be tab-" + "separated.", + .required = true}); + parser.add_list_item("", "Example file:"); + parser.add_list_item("", "```"); + parser.add_list_item("", "/absolute/path/to/file1.fasta"); + parser.add_list_item("", "/absolute/path/to/file2.fa.gz"); + parser.add_list_item("", "```"); + + parser.add_option( + config.k, + sharg::config{ + .short_id = '\0', + .long_id = "kmer", + .description = + "The k-mer size influences the size estimates of the input. " + "Choosing a k-mer size that is too small for " + "your data will result in files appearing more similar than they really are. Likewise, a large " + "k-mer size might miss out on certain similarities. For DNA sequences, a k-mer size between " + "[16,32] has proven to work well."}); + + parser.add_option( + config.hibf_config.tmax, + sharg::config{ + .short_id = '\0', + .long_id = "tmax", + .description = + "Limits the number of technical bins on each level of the HIBF. Choosing a good tmax is not " + "trivial. The smaller tmax, the more levels the layout needs to represent the data. This results " + "in a higher space consumption of the index. While querying each individual level is cheap, " + "querying many levels might also lead to an increased runtime. " + "A good tmax is usually the square root of the number of user bins/samples rounded to the next " + "multiple of 64. Note that your tmax will always be rounded to the next multiple of 64. " + "At the expense of a longer runtime, you can enable the statistic mode that determines the best " + "tmax for your data set. See the advanced option --determine-best-tmax", + .default_message = "≈sqrt(#samples)", + .advanced = true}); + + parser.add_option( + config.hibf_config.number_of_hash_functions, + sharg::config{.short_id = '\0', + .long_id = "hash", + .description = + "The number of hash functions to use when building the HIBF from the resulting layout. 
" + "This parameter is needed to correctly estimate the index size when computing the layout."}); + + parser.add_option( + config.hibf_config.maximum_false_positive_rate, + sharg::config{.short_id = '\0', + .long_id = "fpr", + .description = + "The false positive rate you aim for when building the HIBF from the resulting layout. " + "This parameter is needed to correctly estimate the index size when computing the layout."}); + + parser.add_option( + config.output_filename, + sharg::config{.short_id = '\0', .long_id = "output", .description = "A file name for the resulting layout."}); + + parser.add_option( + config.hibf_config.threads, + sharg::config{ + .short_id = '\0', + .long_id = "threads", + .description = + "The number of threads to use. Currently, only merging of sketches is parallelized, so if the flag " + "--disable-rearrangement is set, --threads will have no effect.", + .validator = + sharg::arithmetic_range_validator{static_cast(1), std::numeric_limits::max()}}); + + parser.add_subsection("HyperLogLog Sketches:"); + parser.add_line("To improve the layout, you can estimate the sequence similarities using HyperLogLog sketches."); + + parser.add_flag( + config.hibf_config.disable_estimate_union, + sharg::config{ + .short_id = '\0', + .long_id = "disable-estimate-union", + .description = + "The sketches are used to estimate the sequence similarity among a set of user bins. This will improve " + "the layout computation as merging user bins that do not increase technical bin sizes will be " + "preferred. This may use more RAM and can be disabled in RAM-critical environments. 
" + "Attention: Also disables rearrangement which depends on union estimations."}); + + parser.add_flag( + config.hibf_config.disable_rearrangement, + sharg::config{ + .short_id = '\0', + .long_id = "disable-rearrangement", + .description = + "As a preprocessing step, rearranging the order of the given user bins based on their sequence " + "similarity may lead to favourable small unions and thus a smaller index. " + "Depending on the number of input samples (user bins), this may be time-consuming and can thus be " + "disabled if a suboptimal layout is sufficient."}); + + parser.add_subsection("Parameter Tweaking:"); + // ----------------------------------------------------------------------------------------------------------------- + parser.add_option( + config.hibf_config.alpha, + sharg::config{ + .short_id = '\0', + .long_id = "alpha", + .description = + "The layout algorithm optimizes the space consumption of the resulting HIBF but currently has no " + "means of optimizing the runtime for querying such an HIBF. In general, the ratio of merged bins " + "and split bins influences the query time because a merged bin always triggers another search on " + "a lower level. To influence this ratio, alpha can be used. The higher alpha, the less merged " + "bins are chosen in the layout. This improves query times but leads to a bigger index.", + .advanced = true}); + + parser.add_option( + config.hibf_config.max_rearrangement_ratio, + sharg::config{ + .short_id = '\0', + .long_id = "max-rearrangement-ratio", + .description = + "When the flag --disable-rearrangement is \\fBnot\\fP set, this option can influence the rearrangement " + "algorithm. The algorithm only rearranges the order of user bins in fixed intervals. The higher " + "--max-rearrangement-ratio, the larger the intervals. 
This potentially improves the layout, but " + "increases the runtime of the layout algorithm.", + .advanced = true, + .validator = sharg::arithmetic_range_validator{0.0, 1.0}}); + + parser.add_option( + config.hibf_config.sketch_bits, + sharg::config{.short_id = '\0', + .long_id = "sketch-bits", + .description = + "The number of bits the HyperLogLog sketch should use to distribute the values into bins.", + .advanced = true, + .validator = sharg::arithmetic_range_validator{5, 32}}); + + parser.add_subsection("Special options"); + // ----------------------------------------------------------------------------------------------------------------- + parser.add_flag( + config.determine_best_tmax, + sharg::config{ + .short_id = '\0', + .long_id = "determine-best-tmax", + .description = + "When this flag is set, the program will compute multiple layouts for tmax in " + "[64 , 128, 256, ... , tmax] as well as tmax=sqrt(#samples). " + "The layout algorithm itself only optimizes the space consumption. When determining the best " + "layout, we additionally keep track of the average number of queries needed to traverse each " + "layout. This query cost is taken into account when determining the best tmax for your data. " + "Note that the option --tmax serves as upper bound. Once the layout quality starts dropping, the " + "computation is stopped. To run all layout computations, pass the flag --force-all-binnings.", + .advanced = true}); + + parser.add_flag( + config.force_all_binnings, + sharg::config{ + .short_id = '\0', + .long_id = "force-all-binnings", + .description = + "Forces all layouts up to --tmax to be computed, " + "regardless of the layout quality. 
If the flag --determine-best-tmax is not set, this flag is " + "ignored and has no effect.", + .advanced = true}); + + parser.add_flag( + config.output_verbose_statistics, + sharg::config{.short_id = '\0', + .long_id = "output-verbose-statistics", + .description = + "Enable verbose statistics to be " + "printed to std::cout. If the flag --determine-best-tmax is not set, this flag is ignored " + "and has no effect.", + .hidden = true}); + + parser.add_option( + config.sketch_directory, + sharg::config{ + .long_id = "output-sketches-to", + .description = + "If you supply a directory path with this option, the hyperloglog sketches of your input will be " + "stored in the respective path; one .hll file per input file.", + .default_message = "None", + .advanced = true}); + + parser.add_flag(config.debug, + sharg::config{.short_id = '\0', + .long_id = "debug", + .description = "Enables debug output in layout file.", + .hidden = true}); + + parser.add_section("References"); + parser.add_line("[1] Philippe Flajolet, Éric Fusy, Olivier Gandouet, Frédéric Meunier. HyperLogLog: the analysis " + "of a near-optimal cardinality estimation algorithm. AofA: Analysis of Algorithms, Jun 2007, Juan " + "les Pins, France. pp.137-156. 
hal-00406166v2, https://doi.org/10.46298/dmtcs.3545"); +} + +} // namespace chopper diff --git a/src/sketch/CMakeLists.txt b/src/sketch/CMakeLists.txt new file mode 100644 index 00000000..8b02e97b --- /dev/null +++ b/src/sketch/CMakeLists.txt @@ -0,0 +1,7 @@ +cmake_minimum_required (VERSION 3.18) + +if (NOT TARGET chopper_sketch) + add_library (chopper_sketch STATIC check_filenames.cpp output.cpp read_data_file.cpp read_hll_files_into.cpp) + + target_link_libraries (chopper_sketch PUBLIC chopper_shared) +endif () diff --git a/src/sketch/check_filenames.cpp b/src/sketch/check_filenames.cpp new file mode 100644 index 00000000..3b89eb48 --- /dev/null +++ b/src/sketch/check_filenames.cpp @@ -0,0 +1,73 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace chopper::sketch +{ + +//!\brief Checks the `filenames` for consistent files, either precomputed or sequence files. 
+void check_filenames(std::vector const & filenames, configuration & config) +{ + assert(!filenames.empty()); + + auto case_insensitive_string_ends_with = [](std::string_view str, std::string_view suffix) + { + size_t const suffix_length{suffix.size()}; + size_t const str_length{str.size()}; + + if (suffix_length > str_length) + return false; // GCOVR_EXCL_LINE + + for (size_t j = 0, s_start = str_length - suffix_length; j < suffix_length; ++j) + if (std::tolower(str[s_start + j]) != std::tolower(suffix[j])) + return false; + + return true; + }; + + // If the first filename ends in .minimiser we expect all files to end in .minimiser + config.precomputed_files = case_insensitive_string_ends_with(filenames[0], ".minimiser"); + + for (auto const & filename : filenames) + { +#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wrestrict" +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY + if (!std::filesystem::exists(filename)) + throw std::invalid_argument{"File " + filename + " does not exist!"}; +#if CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY +# pragma GCC diagnostic pop +#endif // CHOPPER_WORKAROUND_GCC_BOGUS_MEMCPY + + if (config.precomputed_files && !case_insensitive_string_ends_with(filename, ".minimiser")) + { + throw std::invalid_argument{"You are providing precomputed files but the file " + filename + + " does not have the correct file extension (.minimiser)." + " Mixing non-/precomputed files is not allowed."}; + } + else if (!config.precomputed_files && case_insensitive_string_ends_with(filename, ".minimiser")) + { + throw std::invalid_argument{"You are providing sequence files but the file " + filename + + " was identified as a precomputed file (.minimiser)." 
+ " Mixing non-/precomputed files is not allowed."}; + } + } +} + +} // namespace chopper::sketch diff --git a/src/sketch/output.cpp b/src/sketch/output.cpp new file mode 100644 index 00000000..6630ae79 --- /dev/null +++ b/src/sketch/output.cpp @@ -0,0 +1,48 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +namespace chopper::sketch +{ + +void write_count_file_line(std::pair> const & cluster, + uint64_t const weight, + std::ofstream & fout) +{ + auto & [key, filepaths] = cluster; + + for (auto && arr : filepaths | seqan::stl::views::join_with(';')) + fout << arr; + + fout << '\t' << weight << '\t' << key << '\n'; +} + +void write_sketch_file(std::string const & filename, + seqan::hibf::sketch::hyperloglog const & sketch, + configuration const & config) +{ + // For one file in the cluster, the file stem is used with the .hll ending + std::filesystem::path path = config.sketch_directory / std::filesystem::path(filename).stem(); + path += ".hll"; + std::ofstream hll_fout(path, std::ios::binary); + sketch.store(hll_fout); +} + +} // namespace chopper::sketch diff --git a/src/sketch/read_data_file.cpp b/src/sketch/read_data_file.cpp new file mode 100644 index 00000000..ad52e311 --- /dev/null +++ b/src/sketch/read_data_file.cpp @@ -0,0 +1,45 @@ +// --------------------------------------------------------------------------------------------------- +// 
Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include +#include +#include + +#include +#include + +namespace chopper::sketch +{ + +void read_data_file(configuration const & config, std::vector & filenames) +{ + std::ifstream fin{config.data_file.string()}; + + if (!fin.good() || !fin.is_open()) + throw std::runtime_error{"Could not open data file " + config.data_file.string() + " for reading."}; + + std::string line; + while (std::getline(fin, line)) + { + auto tab_pos = line.find('\t'); + + if (tab_pos == std::string::npos) + { + std::string const filename{line.begin(), line.end()}; + filenames.push_back(filename); + } + else + { + std::string const filename{line.begin(), line.begin() + tab_pos}; + filenames.push_back(filename); + } + } +} + +} // namespace chopper::sketch diff --git a/src/sketch/read_hll_files_into.cpp b/src/sketch/read_hll_files_into.cpp new file mode 100644 index 00000000..e48bbab0 --- /dev/null +++ b/src/sketch/read_hll_files_into.cpp @@ -0,0 +1,52 @@ +// --------------------------------------------------------------------------------------------------- +// Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +// Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +// shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +// --------------------------------------------------------------------------------------------------- + +#include +#include +#include 
+#include +#include +#include +#include + +#include + +namespace chopper::sketch +{ + +void read_hll_files_into(std::filesystem::path const & hll_dir, + std::vector const & target_filenames, + std::vector & target) +{ + assert(std::filesystem::exists(hll_dir) && !std::filesystem::is_empty(hll_dir)); // checked in chopper_layout + + target.reserve(target_filenames.size()); + + try + { + for (auto const & filename : target_filenames) + { + std::filesystem::path path = hll_dir / std::filesystem::path(filename).stem(); + path += ".hll"; + std::ifstream hll_fin(path, std::ios::binary); + + if (!hll_fin.good()) + throw std::runtime_error{"Could not open file " + path.string()}; + + // the sketch bits will be automatically read from the files + target.emplace_back().load(hll_fin); + } + } + catch (std::runtime_error const & err) + { + std::string const chopper_msg{"[CHOPPER LAYOUT ERROR] Something went wrong trying to read the HyperLogLog" + " sketches from files:\n"}; + throw std::runtime_error{chopper_msg + err.what()}; + } +} + +} // namespace chopper::sketch diff --git a/src/util/CMakeLists.txt b/src/util/CMakeLists.txt new file mode 100644 index 00000000..e38d4a0d --- /dev/null +++ b/src/util/CMakeLists.txt @@ -0,0 +1,13 @@ +# --------------------------------------------------------------------------------------------------- +# Copyright (c) 2006-2023, Knut Reinert & Freie Universität Berlin +# Copyright (c) 2016-2023, Knut Reinert & MPI für molekulare Genetik +# This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License +# shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md +# --------------------------------------------------------------------------------------------------- + +cmake_minimum_required (VERSION 3.18) + +add_executable (measure_hyperloglog EXCLUDE_FROM_ALL measure_hyperloglog.cpp) +target_link_libraries (measure_hyperloglog PUBLIC chopper_lib) + +add_subdirectory 
(display_layout) diff --git a/src/display_layout/CMakeLists.txt b/src/util/display_layout/CMakeLists.txt similarity index 92% rename from src/display_layout/CMakeLists.txt rename to src/util/display_layout/CMakeLists.txt index 22dbab5f..da0404ff 100644 --- a/src/display_layout/CMakeLists.txt +++ b/src/util/display_layout/CMakeLists.txt @@ -8,4 +8,4 @@ cmake_minimum_required (VERSION 3.18) add_executable (display_layout EXCLUDE_FROM_ALL display_layout.cpp general.cpp process_file.cpp sizes.cpp) -target_link_libraries (display_layout "chopper_interface") +target_link_libraries (display_layout PUBLIC chopper_lib) diff --git a/src/display_layout/display_layout.cpp b/src/util/display_layout/display_layout.cpp similarity index 84% rename from src/display_layout/display_layout.cpp rename to src/util/display_layout/display_layout.cpp index dfc210df..a6be5a15 100644 --- a/src/display_layout/display_layout.cpp +++ b/src/util/display_layout/display_layout.cpp @@ -5,29 +5,16 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include +#include #include -#include +#include +#include +#include -#include - -#include #include #include -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - #include "shared.hpp" void init_shared_meta(sharg::parser & parser) diff --git a/src/display_layout/general.cpp b/src/util/display_layout/general.cpp similarity index 94% rename from src/display_layout/general.cpp rename to src/util/display_layout/general.cpp index f5b042aa..0a6095b4 100644 --- a/src/display_layout/general.cpp +++ b/src/util/display_layout/general.cpp @@ -5,17 +5,24 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // 
--------------------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include #include - -#include - -#include -#include -#include +#include +#include +#include +#include +#include #include #include +#include #include #include "shared.hpp" @@ -57,7 +64,15 @@ int execute(config const & cfg) if (!layout_file.good() || !layout_file.is_open()) throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; +// https://godbolt.org/z/PeKnxzjn1 +#if defined(__clang__) + auto tuple = chopper::layout::read_layout_file(layout_file); + auto filenames = std::get<0>(tuple); + auto chopper_config = std::get<1>(tuple); + auto hibf_layout = std::get<2>(tuple); +#else auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); +#endif auto const & hibf_config = chopper_config.hibf_config; std::ofstream output_stream{cfg.output}; diff --git a/src/display_layout/process_file.cpp b/src/util/display_layout/process_file.cpp similarity index 95% rename from src/display_layout/process_file.cpp rename to src/util/display_layout/process_file.cpp index 3d08361a..4e20cc40 100644 --- a/src/display_layout/process_file.cpp +++ b/src/util/display_layout/process_file.cpp @@ -5,6 +5,14 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include +#include +#include +#include +#include +#include +#include + #include #include diff --git a/src/display_layout/shared.hpp b/src/util/display_layout/shared.hpp similarity index 100% rename from src/display_layout/shared.hpp rename to src/util/display_layout/shared.hpp diff --git a/src/display_layout/sizes.cpp b/src/util/display_layout/sizes.cpp similarity index 95% rename from src/display_layout/sizes.cpp rename to src/util/display_layout/sizes.cpp index 
cf877372..210fd2ad 100644 --- a/src/display_layout/sizes.cpp +++ b/src/util/display_layout/sizes.cpp @@ -5,22 +5,32 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include - -#include -#include -#include - -#include #include #include #include -#include +#include +#include #include #include #include @@ -298,7 +308,15 @@ void execute_general_stats(config const & cfg) if (!layout_file.good() || !layout_file.is_open()) throw std::logic_error{"Could not open file " + cfg.input.string() + " for reading"}; +// https://godbolt.org/z/PeKnxzjn1 +#if defined(__clang__) + auto tuple = chopper::layout::read_layout_file(layout_file); + auto filenames = std::get<0>(tuple); + auto chopper_config = std::get<1>(tuple); + auto hibf_layout = std::get<2>(tuple); +#else auto [filenames, chopper_config, hibf_layout] = chopper::layout::read_layout_file(layout_file); +#endif // Prepare configs chopper_config.hibf_config.threads = cfg.threads; diff --git a/src/measure_hyperloglog.cpp b/src/util/measure_hyperloglog.cpp similarity index 98% rename from src/measure_hyperloglog.cpp rename to src/util/measure_hyperloglog.cpp index 7da147ab..5de104a4 100644 --- a/src/measure_hyperloglog.cpp +++ b/src/util/measure_hyperloglog.cpp @@ -5,10 +5,14 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include #include #include #include +#include +#include #include +#include #include #include diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 2bf1321f..0dffd461 100644 --- 
a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -43,7 +43,13 @@ macro (add_app_test test_filename test_alternative) # Create the test target. add_executable (${target} ${test_filename}) target_link_libraries (${target} "${PROJECT_NAME}_lib" seqan3::seqan3 gtest gtest_main) - target_compile_options (${target} PRIVATE "-Werror") + + # GCC12 and above: Disable warning about std::hardware_destructive_interference_size not being ABI-stable. + if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + target_compile_options (${target} PRIVATE "-Wno-interference-size") + endif () + endif () # Make seqan3::test available for both cli and api tests. target_include_directories (${target} PUBLIC "${SEQAN3_CLONE_DIR}/test/include") @@ -96,6 +102,4 @@ else () add_subdirectory (coverage) endif () -add_dependencies (cli_test measure_hyperloglog) - message (STATUS "${FontBold}You can run `make test` to build and run tests.${FontReset}") diff --git a/test/api/input_functor_test.cpp b/test/api/input_functor_test.cpp index b8e8061c..6e056f6f 100644 --- a/test/api/input_functor_test.cpp +++ b/test/api/input_functor_test.cpp @@ -9,8 +9,6 @@ #include -#include - TEST(execute_test, small_example) { // std::string input_filename = data("small.fa"); diff --git a/test/api/layout/execute_layout_test.cpp b/test/api/layout/execute_layout_test.cpp old mode 100755 new mode 100644 index 18d2b2f4..d163dc3a --- a/test/api/layout/execute_layout_test.cpp +++ b/test/api/layout/execute_layout_test.cpp @@ -7,8 +7,12 @@ #include +#include +#include #include -#include +#include +#include +#include #include #include diff --git a/test/api/layout/execute_with_estimation_test.cpp b/test/api/layout/execute_with_estimation_test.cpp index 145b2cf7..aedbf281 100644 --- a/test/api/layout/execute_with_estimation_test.cpp +++ b/test/api/layout/execute_with_estimation_test.cpp @@ -7,13 +7,19 @@ #include -#include -#include +#include +#include +#include +#include 
+#include +#include +#include #include #include #include +#include #include #include diff --git a/test/api/layout/hibf_statistics_test.cpp b/test/api/layout/hibf_statistics_test.cpp index d4e84ee6..a5e70312 100644 --- a/test/api/layout/hibf_statistics_test.cpp +++ b/test/api/layout/hibf_statistics_test.cpp @@ -7,8 +7,14 @@ #include +#include +#include +#include #include +#include #include +#include +#include #include #include diff --git a/test/api/layout/ibf_query_cost_test.cpp b/test/api/layout/ibf_query_cost_test.cpp index cdab3db9..ea8fab6f 100644 --- a/test/api/layout/ibf_query_cost_test.cpp +++ b/test/api/layout/ibf_query_cost_test.cpp @@ -7,9 +7,12 @@ #include -#include +#include +#include +#include +#include -#include "../api_test.hpp" +#include TEST(ibf_query_cost_test, exact) { diff --git a/test/api/layout/user_bin_io_test.cpp b/test/api/layout/user_bin_io_test.cpp index a2515931..daf04978 100644 --- a/test/api/layout/user_bin_io_test.cpp +++ b/test/api/layout/user_bin_io_test.cpp @@ -1,10 +1,8 @@ #include // for Test, TestInfo, EXPECT_EQ, Message, TEST, TestPartResult -#include // for size_t -#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... -#include // for allocator, string -#include // for operator<< -#include // for vector +#include // for operator<<, char_traits, basic_ostream, basic_stringstream, strings... 
+#include // for allocator, string +#include // for vector #include #include diff --git a/test/api/sketch/check_filenames_test.cpp b/test/api/sketch/check_filenames_test.cpp index 37970150..af2df860 100644 --- a/test/api/sketch/check_filenames_test.cpp +++ b/test/api/sketch/check_filenames_test.cpp @@ -7,6 +7,11 @@ #include +#include +#include +#include +#include + #include #include "../api_test.hpp" diff --git a/test/api/sketch/read_data_file_test.cpp b/test/api/sketch/read_data_file_test.cpp index 3cf02b57..ffd92ad7 100644 --- a/test/api/sketch/read_data_file_test.cpp +++ b/test/api/sketch/read_data_file_test.cpp @@ -7,7 +7,9 @@ #include -#include +#include +#include +#include #include #include diff --git a/test/api/sketch/read_hll_files_into_test.cpp b/test/api/sketch/read_hll_files_into_test.cpp index 1147bb5f..74c73601 100644 --- a/test/api/sketch/read_hll_files_into_test.cpp +++ b/test/api/sketch/read_hll_files_into_test.cpp @@ -7,6 +7,14 @@ #include +#include +#include +#include +#include +#include +#include +#include + #include #include diff --git a/test/cli/cli_chopper_basic_test.cpp b/test/cli/cli_chopper_basic_test.cpp index 2cb59d13..52eb08b8 100644 --- a/test/cli/cli_chopper_basic_test.cpp +++ b/test/cli/cli_chopper_basic_test.cpp @@ -5,10 +5,11 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // --------------------------------------------------------------------------------------------------- +#include + +#include #include -#include // range comparisons #include // strings -#include // vectors #include diff --git a/test/cli/cli_chopper_pipeline_test.cpp b/test/cli/cli_chopper_pipeline_test.cpp index 34ef1c92..e44f5700 100644 --- a/test/cli/cli_chopper_pipeline_test.cpp +++ b/test/cli/cli_chopper_pipeline_test.cpp @@ -5,17 +5,13 @@ // shipped with this file and also available at: https://github.com/seqan/chopper/blob/main/LICENSE.md // 
--------------------------------------------------------------------------------------------------- +#include + +#include #include -#include // range comparisons #include // strings -#include // vectors - -#include -#include #include -#include -#include #include "../api/api_test.hpp" #include "cli_test.hpp" diff --git a/test/header/CMakeLists.txt b/test/header/CMakeLists.txt index 30d11efd..67b77ab2 100644 --- a/test/header/CMakeLists.txt +++ b/test/header/CMakeLists.txt @@ -11,7 +11,7 @@ include (seqan3_test_files) include (seqan3_test_component) add_library ("chopper_header_test_lib" INTERFACE) -target_link_libraries ("chopper_header_test_lib" INTERFACE "${PROJECT_NAME}_interface" gtest gtest_main) +target_link_libraries ("chopper_header_test_lib" INTERFACE "chopper_shared" gtest gtest_main) target_include_directories ("chopper_header_test_lib" INTERFACE "${SEQAN3_TEST_CLONE_DIR}/googletest/include/") # SeqAn3 script adds an include for , which we do not use in Chopper target_include_directories ("chopper_header_test_lib" INTERFACE ./dummy_include) @@ -66,7 +66,7 @@ foreach (header ${header_files}) target_include_directories (${header_target} PRIVATE $ ) - add_dependencies (${header_target} "${PROJECT_NAME}_interface" gtest gtest_main) + add_dependencies (${header_target} "chopper_shared" gtest gtest_main) else () target_link_libraries (${header_target} chopper_header_test_lib) endif ()