Skip to content

Commit

Permalink
[MISC] Use exact counts in display_layout (#207)
Browse files Browse the repository at this point in the history
* [MISC] Use exact counts in display_layout

Co-authored-by: Enrico Seiler <[email protected]>

* fix clearing of set

---------

Co-authored-by: Enrico Seiler <[email protected]>
  • Loading branch information
smehringer and eseiler authored Oct 19, 2023
1 parent 074b19f commit 1002f71
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 7 deletions.
18 changes: 13 additions & 5 deletions src/util/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@ int execute(config const & cfg)
return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]);
});

// Estimates the cardinality of one technical bin. For merged bins, user bins will be iteratively added.
seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits};
// Used to determine the exact cardinality for one technical bin.
robin_hood::unordered_set<uint64_t> current_kmer_set{};
// Stores shared k-mers across user bins of a merged technical bin.
robin_hood::unordered_set<uint64_t> shared_kmers{};
// We can't use `shared_kmers.size() == 0` instead of `shared_kmers_initialised`, because keep_duplicates
// will result in a size of 0 when there are no shared k-mers.
Expand Down Expand Up @@ -140,21 +144,24 @@ int execute(config const & cfg)
// Stats file header
output_stream << "# Layout: " << cfg.input.c_str() << '\n' //
<< "tb_index\t"
<< "size\t"
<< "exact_size\t"
<< "estimated_size\t"
<< "shared_size\t"
<< "ub_count\t"
<< "kind\t"
<< "splits" << '\n';
<< "splits\n";

auto print_result_line = [&]()
{
bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged};
size_t const avg_kmer_count = (sketch.estimate() + split_count - 1u) / split_count;
size_t const avg_kmer_count = (current_kmer_set.size() + split_count - 1u) / split_count;
size_t const sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count;

for (size_t i{}, total{split_count}; i < total; ++i)
{
output_stream << current_idx + i << '\t' //
<< avg_kmer_count << '\t' //
<< sketch_estimate << '\t' //
<< shared_kmers.size() << '\t' //
<< ub_count << '\t' //
<< (is_merged ? "merged" : "split") << '\t' //
Expand Down Expand Up @@ -182,11 +189,12 @@ int execute(config const & cfg)
(user_bin.previous_TB_indices.size() == 0) ? user_bin.storage_TB_id : user_bin.previous_TB_indices[0];

// We processed all user bins that belong to the `current_idx`th top-level technical bin.
// Print results and update data.
// Print results, advance the current index, and reset all user bin-specific data.
if (idx != current_idx)
{
print_result_line();
sketch.reset();
current_kmer_set.clear();
shared_kmers.clear();
shared_kmers_initialised = false;
ub_count = 0u;
Expand All @@ -207,7 +215,7 @@ int execute(config const & cfg)
{
++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case.

process_file(filename, current_kmers, sketch, fill_current_kmers, chopper_config.k);
process_file(filename, current_kmer_set, current_kmers, sketch, fill_current_kmers, chopper_config.k);
}

// Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers
Expand Down
33 changes: 33 additions & 0 deletions src/util/display_layout/process_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ using sequence_file_type = seqan3::sequence_file_input<dna4_traits,
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>;

void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
Expand All @@ -48,6 +49,7 @@ void process_file(std::string const & filename,
while (infile.read(hash_data, hash_bytes))
{
current_kmers.push_back(hash);
current_kmer_set.insert(hash);
sketch.add(hash);
}
}
Expand All @@ -56,6 +58,7 @@ void process_file(std::string const & filename,
while (infile.read(hash_data, hash_bytes))
{
sketch.add(hash);
current_kmer_set.insert(hash);
}
}
}
Expand All @@ -74,6 +77,7 @@ void process_file(std::string const & filename,
for (uint64_t hash_value : seq | minimizer_view)
{
current_kmers.push_back(hash_value);
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
}
}
Expand All @@ -84,9 +88,38 @@ void process_file(std::string const & filename,
{
for (uint64_t hash_value : seq | minimizer_view)
{
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
}
}
}
}
}

void process_file(std::string const & filename, std::vector<uint64_t> & current_kmers, uint8_t const kmer_size)
{
if (filename.ends_with(".minimiser"))
{
uint64_t hash{};
char * const hash_data{reinterpret_cast<char *>(&hash)};
std::streamsize const hash_bytes{sizeof(hash)};

std::ifstream infile{filename, std::ios::binary};

while (infile.read(hash_data, hash_bytes))
current_kmers.push_back(hash);
}
else
{
sequence_file_type fin{filename};

seqan3::shape shape{seqan3::ungapped{kmer_size}};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{kmer_size},
seqan3::seed{chopper::adjust_seed(shape.count())});

for (auto && [seq] : fin)
for (uint64_t hash_value : seq | minimizer_view)
current_kmers.push_back(hash_value);
}
}
4 changes: 4 additions & 0 deletions src/util/display_layout/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <string>
#include <vector>

#include <hibf/contrib/robin_hood.hpp>
#include <hibf/sketch/hyperloglog.hpp>

struct config
Expand All @@ -24,7 +25,10 @@ struct config
void execute_general(config const & cfg);
void execute_sizes(config const & cfg);

// Appends all hashes of `filename` to `current_kmers` (existing content is kept).
// If `filename` ends with ".minimiser", the file is read in binary mode as raw `uint64_t` values.
// Otherwise, it is read as a sequence file and the minimiser hashes of each sequence are computed
// with an ungapped shape of size `kmer_size` and a window size of `kmer_size`.
void process_file(std::string const & filename, std::vector<uint64_t> & current_kmers, uint8_t const kmer_size);

void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
Expand Down
3 changes: 1 addition & 2 deletions src/util/display_layout/sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,12 +324,11 @@ void execute_general_stats(config const & cfg)
auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it)
{
std::vector<uint64_t> current_kmers;
seqan::hibf::sketch::hyperloglog sketch;

if (filenames[user_bin_id].size() > 1)
throw std::runtime_error{"No multi files accepted yet."};

process_file(filenames[user_bin_id][0], current_kmers, sketch, true, chopper_config.k);
process_file(filenames[user_bin_id][0], current_kmers, chopper_config.k);

for (auto const kmer : current_kmers)
it = kmer;
Expand Down

0 comments on commit 1002f71

Please sign in to comment.