Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MISC] Use exact counts in display_layout #207

Merged
merged 2 commits into from
Oct 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions src/util/display_layout/general.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,11 @@ int execute(config const & cfg)
return first_idx < second_idx || (first_idx == second_idx && filesizes[lhs.idx] < filesizes[rhs.idx]);
});

// Estimates the cardinality of one technical bin. For merged bins, user bins will be iteratively added.
seqan::hibf::sketch::hyperloglog sketch{hibf_config.sketch_bits};
// Used to determine the exact cardinality for one technical bin.
robin_hood::unordered_set<uint64_t> current_kmer_set{};
// Stores shared k-mers across user bins of a merged technical bin.
robin_hood::unordered_set<uint64_t> shared_kmers{};
// We can't use `shared_kmers.size() == 0` instead of `shared_kmers_initialised`, because keep_duplicates
// will result in a size of 0 when there are no shared k-mers.
Expand Down Expand Up @@ -140,21 +144,24 @@ int execute(config const & cfg)
// Stats file header
output_stream << "# Layout: " << cfg.input.c_str() << '\n' //
<< "tb_index\t"
<< "size\t"
<< "exact_size\t"
<< "estimated_size\t"
<< "shared_size\t"
<< "ub_count\t"
<< "kind\t"
<< "splits" << '\n';
<< "splits\n";

auto print_result_line = [&]()
{
bool const is_merged{bin_kinds[current_idx] == chopper::layout::hibf_statistics::bin_kind::merged};
size_t const avg_kmer_count = (sketch.estimate() + split_count - 1u) / split_count;
size_t const avg_kmer_count = (current_kmer_set.size() + split_count - 1u) / split_count;
size_t const sketch_estimate = (sketch.estimate() + split_count - 1u) / split_count;

for (size_t i{}, total{split_count}; i < total; ++i)
{
output_stream << current_idx + i << '\t' //
<< avg_kmer_count << '\t' //
<< sketch_estimate << '\t' //
<< shared_kmers.size() << '\t' //
<< ub_count << '\t' //
<< (is_merged ? "merged" : "split") << '\t' //
Expand Down Expand Up @@ -182,11 +189,12 @@ int execute(config const & cfg)
(user_bin.previous_TB_indices.size() == 0) ? user_bin.storage_TB_id : user_bin.previous_TB_indices[0];

// We processed all user bins that belong to the `current_idx`th top-level technical bin.
// Print results and update data.
// Print results, advance the current index, and reset all user bin-specific data.
if (idx != current_idx)
{
print_result_line();
sketch.reset();
current_kmer_set.clear();
shared_kmers.clear();
shared_kmers_initialised = false;
ub_count = 0u;
Expand All @@ -207,7 +215,7 @@ int execute(config const & cfg)
{
++ub_count; // This assumes that each user bin has exactly one associated file. Currently the case.

process_file(filename, current_kmers, sketch, fill_current_kmers, chopper_config.k);
process_file(filename, current_kmer_set, current_kmers, sketch, fill_current_kmers, chopper_config.k);
}

// Compute set intersection: shared_kmers = shared_kmers ∩ current_kmers
Expand Down
33 changes: 33 additions & 0 deletions src/util/display_layout/process_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ using sequence_file_type = seqan3::sequence_file_input<dna4_traits,
seqan3::type_list<seqan3::format_fasta, seqan3::format_fastq>>;

void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
Expand All @@ -48,6 +49,7 @@ void process_file(std::string const & filename,
while (infile.read(hash_data, hash_bytes))
{
current_kmers.push_back(hash);
current_kmer_set.insert(hash);
sketch.add(hash);
}
}
Expand All @@ -56,6 +58,7 @@ void process_file(std::string const & filename,
while (infile.read(hash_data, hash_bytes))
{
sketch.add(hash);
current_kmer_set.insert(hash);
}
}
}
Expand All @@ -74,6 +77,7 @@ void process_file(std::string const & filename,
for (uint64_t hash_value : seq | minimizer_view)
{
current_kmers.push_back(hash_value);
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
}
}
Expand All @@ -84,9 +88,38 @@ void process_file(std::string const & filename,
{
for (uint64_t hash_value : seq | minimizer_view)
{
current_kmer_set.insert(hash_value);
sketch.add(hash_value);
}
}
}
}
}

void process_file(std::string const & filename, std::vector<uint64_t> & current_kmers, uint8_t const kmer_size)
{
if (filename.ends_with(".minimiser"))
{
uint64_t hash{};
char * const hash_data{reinterpret_cast<char *>(&hash)};
std::streamsize const hash_bytes{sizeof(hash)};

std::ifstream infile{filename, std::ios::binary};

while (infile.read(hash_data, hash_bytes))
current_kmers.push_back(hash);
}
else
{
sequence_file_type fin{filename};

seqan3::shape shape{seqan3::ungapped{kmer_size}};
auto minimizer_view = seqan3::views::minimiser_hash(shape,
seqan3::window_size{kmer_size},
seqan3::seed{chopper::adjust_seed(shape.count())});

for (auto && [seq] : fin)
for (uint64_t hash_value : seq | minimizer_view)
current_kmers.push_back(hash_value);
}
}
4 changes: 4 additions & 0 deletions src/util/display_layout/shared.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <string>
#include <vector>

#include <hibf/contrib/robin_hood.hpp>
#include <hibf/sketch/hyperloglog.hpp>

struct config
Expand All @@ -24,7 +25,10 @@ struct config
void execute_general(config const & cfg);
void execute_sizes(config const & cfg);

// Appends all hash values of `filename` to `current_kmers` (the vector is not cleared first).
// Files ending in ".minimiser" are read as raw binary uint64_t values; any other file is parsed as a sequence file
// and hashed with a minimiser view whose shape and window size are both `kmer_size`.
void process_file(std::string const & filename, std::vector<uint64_t> & current_kmers, uint8_t const kmer_size);

void process_file(std::string const & filename,
robin_hood::unordered_set<uint64_t> & current_kmer_set,
std::vector<uint64_t> & current_kmers,
seqan::hibf::sketch::hyperloglog & sketch,
bool const fill_current_kmers,
Expand Down
3 changes: 1 addition & 2 deletions src/util/display_layout/sizes.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -324,12 +324,11 @@ void execute_general_stats(config const & cfg)
auto input_lambda = [&filenames, &chopper_config](size_t const user_bin_id, seqan::hibf::insert_iterator it)
{
std::vector<uint64_t> current_kmers;
seqan::hibf::sketch::hyperloglog sketch;

if (filenames[user_bin_id].size() > 1)
throw std::runtime_error{"No multi files accepted yet."};

process_file(filenames[user_bin_id][0], current_kmers, sketch, true, chopper_config.k);
process_file(filenames[user_bin_id][0], current_kmers, chopper_config.k);

for (auto const kmer : current_kmers)
it = kmer;
Expand Down