Skip to content

Commit

Permalink
print bits/kmer using close-form formula
Browse files Browse the repository at this point in the history
  • Loading branch information
jermp committed Jul 9, 2024
1 parent 6904074 commit 05d6127
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 2 deletions.
7 changes: 6 additions & 1 deletion include/builder/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ void dictionary::build(std::string const& filename, build_configuration const& b
throw std::runtime_error("m must be less <= " + std::to_string(constants::max_m) +
" but got m = " + std::to_string(build_config.m));
}
if (build_config.m > build_config.k) throw std::runtime_error("m must be < k");
if (build_config.m > build_config.k) throw std::runtime_error("m must be <= k");
if (build_config.l >= constants::max_l) {
throw std::runtime_error("l must be < " + std::to_string(constants::max_l));
}
Expand Down Expand Up @@ -74,6 +74,11 @@ void dictionary::build(std::string const& filename, build_configuration const& b
mm::file_source<minimizer_tuple> input(data.minimizers.get_minimizers_filename(),
mm::advice::sequential);
minimizers_tuples_iterator iterator(input.data(), input.data() + input.size());

if (build_config.verbose) {
std::cout << "num_minimizers " << data.minimizers.num_minimizers() << std::endl;
}

m_minimizers.build(iterator, data.minimizers.num_minimizers(), build_config);
input.close();
}
Expand Down
43 changes: 42 additions & 1 deletion include/info.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,13 +23,51 @@ uint64_t skew_index::print_info() const {
return num_kmers_in_skew_index;
}

double bits_per_kmer_formula(uint64_t k, /* kmer length */
uint64_t m, /* minimizer length */
uint64_t n, /* num. kmers */
uint64_t M) /* num. strings in SPSS */
{
/*
Caveats:
1. we assume an alphabet of size 4
2. this assumes a random minimizer scheme, so num. super-kmers is ~ 2n/(k-m+2)
3. we neglect lower order terms and skew index space
4. no canonical parsing
*/

assert(k > 0);
assert(k >= m);

const uint64_t N = n + M * (k - 1); // num. symbols in SPSS

// double num_minimizers = (2.0 * n) / (k - m + 2); // not distinct, hence num. of super-kmers
// std::cout << "num_minimizers = " << num_minimizers << std::endl;
// std::cout << "minimizers: " << (3.0 * num_minimizers) / n << " [bits/kmer]" << std::endl;
// std::cout << "pieces: " << (M * (2.0 + std::ceil(std::log2(static_cast<double>(N) / M)))) / n
// << " [bits/kmer]" << std::endl;
// std::cout << "num_super_kmers_before_bucket: " << (2.0 * num_minimizers) / n << " [bits/kmer]
// "
// << std::endl;
// std::cout << "offsets: " << (std::ceil(std::log2(N)) * num_minimizers) / n << " [bits/kmer]"
// << std::endl;
// std::cout << "strings: " << (2.0 * N) / n << " [bits/kmer]" << std::endl;

double num_bits = 2 * n * (1.0 + (5.0 + std::ceil(std::log2(N))) / (k - m + 2)) +
M * (2 * k + std::ceil(std::log2(static_cast<double>(n) / M + k - 1)));

return num_bits / n;
}

void dictionary::print_space_breakdown() const {
const uint64_t num_bytes = (num_bits() + 7) / 8;
std::cout << "total index size: " << num_bytes << " [B] -- "
<< essentials::convert(num_bytes, essentials::MB) << " [MB]" << '\n';
std::cout << "SPACE BREAKDOWN:\n";
std::cout << " minimizers: " << static_cast<double>(m_minimizers.num_bits()) / size()
<< " [bits/kmer]\n";
<< " [bits/kmer] ("
<< static_cast<double>(m_minimizers.num_bits()) / m_minimizers.size()
<< " [bits/key])\n";
std::cout << " pieces: " << static_cast<double>(m_buckets.pieces.num_bits()) / size()
<< " [bits/kmer]\n";
std::cout << " num_super_kmers_before_bucket: "
Expand All @@ -47,6 +85,9 @@ void dictionary::print_space_breakdown() const {
std::cout << " --------------\n";
std::cout << " total: " << static_cast<double>(num_bits()) / size() << " [bits/kmer]"
<< std::endl;

std::cout << " Close-form formula: " << bits_per_kmer_formula(k(), m(), size(), num_contigs())
<< " [bits/kmer]" << std::endl;
}

void dictionary::print_info() const {
Expand Down

0 comments on commit 05d6127

Please sign in to comment.