Skip to content

Commit

Permalink
Merge pull request #264 from smehringer/display_layout_fpr_corrected
Browse files Browse the repository at this point in the history
[UTIL,FEATURE] display_layout: Also compute FPR corrected sizes.
  • Loading branch information
eseiler authored Oct 22, 2024
2 parents b781524 + ed0086b commit 60ff406
Show file tree
Hide file tree
Showing 2 changed files with 47 additions and 12 deletions.
33 changes: 33 additions & 0 deletions src/util/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@
#include <hibf/contrib/robin_hood.hpp>
#include <hibf/contrib/std/chunk_by_view.hpp>
#include <hibf/contrib/std/to.hpp>
#include <hibf/layout/compute_fpr_correction.hpp>
#include <hibf/layout/compute_relaxed_fpr_correction.hpp>
#include <hibf/misc/divide_and_ceil.hpp>
#include <hibf/sketch/hyperloglog.hpp>

#include "shared.hpp"
Expand Down Expand Up @@ -102,6 +105,7 @@ struct record
size_t tb_index{};
size_t exact_size{};
size_t estimated_size{};
size_t corrected_size{};
size_t shared_size{};
size_t ub_count{};
std::string_view kind{};
Expand All @@ -115,6 +119,7 @@ struct record
stream << tb_index + i << '\t' //
<< exact_size << '\t' //
<< estimated_size << '\t' //
<< corrected_size << '\t' //
<< shared_size << '\t' //
<< ub_count << '\t' //
<< kind << '\t' //
Expand All @@ -130,6 +135,7 @@ struct record
<< "tb_index\t"
<< "exact_size\t"
<< "estimated_size\t"
<< "fpr_corrected_size\t"
<< "shared_size\t"
<< "ub_count\t"
<< "kind\t"
Expand Down Expand Up @@ -197,6 +203,20 @@ int execute(config const & cfg)
auto const & hibf_config = chopper_config.hibf_config;

layout_file.close();

// Correction factor that is multiplied with the cardinality of a merged bin.
double const merged_correction = seqan::hibf::layout::compute_relaxed_fpr_correction(
{.fpr = chopper_config.hibf_config.maximum_fpr,
.relaxed_fpr = chopper_config.hibf_config.relaxed_fpr,
.hash_count = chopper_config.hibf_config.number_of_hash_functions});

// Contains the correction factors for different numbers of splits, indexed by the split count.
// E.g., splitting a user bin into 5 technical bins -> cardinality * split_correction[5] / 5
std::vector<double> const split_correction =
seqan::hibf::layout::compute_fpr_correction({.fpr = chopper_config.hibf_config.maximum_fpr,
.hash_count = chopper_config.hibf_config.number_of_hash_functions,
.t_max = chopper_config.hibf_config.tmax});

std::ofstream output_stream{cfg.output};

if (!output_stream.good() || !output_stream.is_open())
Expand Down Expand Up @@ -330,10 +350,23 @@ int execute(config const & cfg)
size_t const split_count{is_merged ? 1u : hibf_layout.user_bins[chunk[0]].number_of_technical_bins};
size_t const avg_kmer_count = (current_kmer_set.size() + split_count - 1u) / split_count;
size_t const sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count;
// FPR-corrected exact size of a single technical bin, computed by an immediately invoked lambda.
// Merged bin: the k-mer count is scaled by `merged_correction` and rounded up.
// Split bin: the total k-mer count is scaled by `split_correction[split_count]`, rounded up, and
// then divided among the `split_count` technical bins.
size_t const corrected_exact_size = [&]() -> size_t
{
if (is_merged)
{
// split_count is 1 for merged bins, so avg_kmer_count equals the size of the whole k-mer set here.
return std::ceil(avg_kmer_count * merged_correction);
}
else
{
// Apply the correction to the total content first, then distribute it over the technical bins.
size_t const corrected_content = std::ceil(current_kmer_set.size() * split_correction[split_count]);
// NOTE(review): divide_and_ceil presumably performs a rounding-up integer division - confirm.
return seqan::hibf::divide_and_ceil(corrected_content, split_count);
}
}();

records[tb_index] = record{.tb_index = tb_index,
.exact_size = avg_kmer_count,
.estimated_size = sketch_estimate,
.corrected_size = corrected_exact_size,
.shared_size = shared_kmers.size(),
.ub_count = ub_count,
.kind = (is_merged ? "merged" : "split"),
Expand Down
26 changes: 14 additions & 12 deletions test/cli/util_display_layout_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,13 @@ TEST_F(cli_test, display_layout_general)

ASSERT_TRUE(std::filesystem::exists(general_filename));

std::string expected_general_file{"# Layout: " + layout_filename.string() + "\n" +
R"(tb_index exact_size estimated_size shared_size ub_count kind splits
0 479 483 0 2 merged 1
1 466 466 0 1 split 1
2 287 289 0 1 split 2
3 287 289 0 1 split 0
std::string expected_general_file{
"# Layout: " + layout_filename.string() + "\n" +
R"(tb_index exact_size estimated_size fpr_corrected_size shared_size ub_count kind splits
0 479 483 153 0 2 merged 1
1 466 466 466 0 1 split 1
2 287 289 420 0 1 split 2
3 287 289 420 0 1 split 0
)"};

std::string const actual_file{string_from_file(general_filename)};
Expand Down Expand Up @@ -164,12 +165,13 @@ TEST_F(cli_test, display_layout_general_with_shared_kmers)

ASSERT_TRUE(std::filesystem::exists(general_filename));

std::string expected_general_file{"# Layout: " + layout_filename.string() + "\n" +
R"(tb_index exact_size estimated_size shared_size ub_count kind splits
0 479 483 371 2 merged 1
1 466 466 0 1 split 1
2 287 289 0 1 split 2
3 287 289 0 1 split 0
std::string expected_general_file{
"# Layout: " + layout_filename.string() + "\n" +
R"(tb_index exact_size estimated_size fpr_corrected_size shared_size ub_count kind splits
0 479 483 153 371 2 merged 1
1 466 466 466 0 1 split 1
2 287 289 420 0 1 split 2
3 287 289 420 0 1 split 0
)"};

std::string const actual_file{string_from_file(general_filename)};
Expand Down

0 comments on commit 60ff406

Please sign in to comment.