Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Piscem-cpp #47

Merged
merged 6 commits into from
Jul 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 142 additions & 2 deletions include/builder/parse_file.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ struct parse_data {
weights::builder weights_builder;
};

void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) {
void parse_file_from_fasta(std::istream& is, parse_data& data, build_configuration const& build_config) {
uint64_t k = build_config.k;
uint64_t m = build_config.m;
uint64_t seed = build_config.seed;
Expand Down Expand Up @@ -205,6 +205,146 @@ void parse_file(std::istream& is, parse_data& data, build_configuration const& b
}
}

/*
    Parse a cf_seg stream and fill `data` with minimizer tuples and the packed strings.

    Input is read one record per line. Everything up to and including the first
    tab of a line is discarded and the remainder is taken as the sequence; a line
    without a tab is used as is. Sequences shorter than k are skipped.

    For every k-mer of a sequence its minimizer is computed (the smaller of the
    forward and reverse-complement minimizers when build_config.canonical_parsing
    is set). Consecutive k-mers sharing a minimizer form a super-k-mer, which is
    appended to the string pool in blocks of at most k - m + 1 k-mers.

    Throws std::runtime_error when k - m + 1 does not fit into
    num_kmers_in_super_kmer_uint_type. It also throws "file is malformed" when
    build_config.weighted is set and a sequence passes the length filter with a
    non-zero size: seq_len is never updated in this function, so weights are
    effectively unsupported here.
*/
void parse_file_from_cfseg(std::istream& is, parse_data& data, build_configuration const& build_config) {
    uint64_t k = build_config.k;
    uint64_t m = build_config.m;
    uint64_t seed = build_config.seed;
    uint64_t max_num_kmers_in_super_kmer = k - m + 1;
    uint64_t block_size = 2 * k - m;  // max_num_kmers_in_super_kmer + k - 1

    if (max_num_kmers_in_super_kmer >= (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8))) {
        throw std::runtime_error(
            "max_num_kmers_in_super_kmer " + std::to_string(max_num_kmers_in_super_kmer) +
            " does not fit into " + std::to_string(sizeof(num_kmers_in_super_kmer_uint_type) * 8) +
            " bits");
    }

    /* fit into the wanted number of bits */
    assert(max_num_kmers_in_super_kmer < (1ULL << (sizeof(num_kmers_in_super_kmer_uint_type) * 8)));

    compact_string_pool::builder builder(k);

    std::string sequence;
    uint64_t prev_minimizer = constants::invalid_uint64;

    uint64_t begin = 0;  // begin of parsed super_kmer in sequence
    uint64_t end = 0;    // end of parsed super_kmer in sequence
    uint64_t num_sequences = 0;
    uint64_t num_bases = 0;
    bool glue = false;

    /* Emit the super-k-mer covering k-mers [begin, end) of the current sequence. */
    auto append_super_kmer = [&]() {
        if (sequence.empty() or prev_minimizer == constants::invalid_uint64 or begin == end) return;

        assert(end > begin);
        char const* super_kmer = sequence.data() + begin;
        uint64_t size = (end - begin) + k - 1;  // number of bases spanned by the super-k-mer
        assert(util::is_valid(super_kmer, size));

        /* if num_kmers_in_super_kmer > k - m + 1, then split the super_kmer into blocks */
        uint64_t num_kmers_in_super_kmer = end - begin;
        uint64_t num_blocks = num_kmers_in_super_kmer / max_num_kmers_in_super_kmer +
                              (num_kmers_in_super_kmer % max_num_kmers_in_super_kmer != 0);
        assert(num_blocks > 0);
        for (uint64_t i = 0; i != num_blocks; ++i) {
            uint64_t n = block_size;
            if (i == num_blocks - 1) n = size;  // last block takes whatever bases remain
            uint64_t num_kmers_in_block = n - k + 1;
            assert(num_kmers_in_block <= max_num_kmers_in_super_kmer);
            data.minimizers.emplace_back(prev_minimizer, builder.offset, num_kmers_in_block);
            builder.append(super_kmer + i * max_num_kmers_in_super_kmer, n, glue);
            if (glue) {
                /* NOTE(review): presumably a glued block shares its first k - 1 bases with
                   what is already in the pool, hence the offset correction -- confirm
                   against compact_string_pool::builder::append. */
                assert(data.minimizers.back().offset > k - 1);
                data.minimizers.back().offset -= k - 1;
            }
            size -= max_num_kmers_in_super_kmer;
            glue = true;
        }
    };

    uint64_t seq_len = 0;
    uint64_t sum_of_weights = 0;
    data.weights_builder.init();

    /* intervals of weights */
    uint64_t weight_value = constants::invalid_uint64;
    uint64_t weight_length = 0;

    /* Drive the loop from the result of getline rather than from eof(): if the stream
       fails without reaching end-of-file, getline stops updating `sequence` and an
       eof()-controlled loop would re-parse the previous line forever. */
    while (std::getline(is, sequence)) {
        /* drop the leading field (up to and including the first tab), if any */
        auto tsep = sequence.find('\t');
        if (tsep != std::string::npos) sequence.erase(0, tsep + 1);
        if (sequence.size() < k) continue;

        if (++num_sequences % 100000 == 0) {
            std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, "
                      << data.num_kmers << " kmers" << std::endl;
        }
        begin = 0;
        end = 0;
        glue = false;  // start a new piece
        prev_minimizer = constants::invalid_uint64;
        num_bases += sequence.size();

        if (build_config.weighted and seq_len != sequence.size()) {
            throw std::runtime_error("file is malformed");
        }

        while (end != sequence.size() - k + 1) {
            char const* kmer = sequence.data() + end;
            assert(util::is_valid(kmer, k));
            kmer_t uint_kmer = util::string_to_uint_kmer(kmer, k);
            uint64_t minimizer = util::compute_minimizer(uint_kmer, k, m, seed);

            if (build_config.canonical_parsing) {
                kmer_t uint_kmer_rc = util::compute_reverse_complement(uint_kmer, k);
                uint64_t minimizer_rc = util::compute_minimizer(uint_kmer_rc, k, m, seed);
                minimizer = std::min<uint64_t>(minimizer, minimizer_rc);
            }

            if (prev_minimizer == constants::invalid_uint64) prev_minimizer = minimizer;
            if (minimizer != prev_minimizer) {
                /* minimizer changed: close the current super-k-mer and open a new one */
                append_super_kmer();
                begin = end;
                prev_minimizer = minimizer;
                glue = true;
            }

            ++data.num_kmers;
            ++end;
        }

        /* flush the last super-k-mer of this sequence */
        append_super_kmer();
    }

    data.minimizers.finalize();
    builder.finalize();
    builder.build(data.strings);

    std::cout << "read " << num_sequences << " sequences, " << num_bases << " bases, "
              << data.num_kmers << " kmers" << std::endl;
    std::cout << "num_kmers " << data.num_kmers << std::endl;
    std::cout << "num_super_kmers " << data.strings.num_super_kmers() << std::endl;
    std::cout << "num_pieces " << data.strings.pieces.size() << " (+"
              << (2.0 * data.strings.pieces.size() * (k - 1)) / data.num_kmers << " [bits/kmer])"
              << std::endl;
    assert(data.strings.pieces.size() == num_sequences + 1);

    if (build_config.weighted) {
        std::cout << "sum_of_weights " << sum_of_weights << std::endl;
        data.weights_builder.push_weight_interval(weight_value, weight_length);
        data.weights_builder.finalize(data.num_kmers);
    }
}

/* Route the stream to the parser matching build_config.input_type:
   cfseg input goes to parse_file_from_cfseg, anything else to parse_file_from_fasta. */
void parse_file(std::istream& is, parse_data& data, build_configuration const& build_config) {
    if (build_config.input_type != input_build_type::cfseg) {
        parse_file_from_fasta(is, data, build_config);
        return;
    }
    parse_file_from_cfseg(is, data, build_config);
}
parse_data parse_file(std::string const& filename, build_configuration const& build_config) {
std::ifstream is(filename.c_str());
if (!is.good()) throw std::runtime_error("error in opening the file '" + filename + "'");
Expand All @@ -220,4 +360,4 @@ parse_data parse_file(std::string const& filename, build_configuration const& bu
return data;
}

} // namespace sshash
} // namespace sshash
32 changes: 30 additions & 2 deletions include/util.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,8 @@ struct neighbourhood {
return good;
}

/* Format of the input file handed to the builder. */
enum input_build_type {
    fasta = 0,  // FASTA input
    cfseg = 1   // cf_seg input
};

struct build_configuration {
build_configuration()
: k(31)
Expand All @@ -89,7 +91,9 @@ struct build_configuration {
, weighted(false)
, verbose(true)

, tmp_dirname(constants::default_tmp_dirname) {}
, tmp_dirname(constants::default_tmp_dirname)
, input_type(input_build_type::fasta) {}


uint64_t k; // kmer size
uint64_t m; // minimizer size
Expand All @@ -103,12 +107,14 @@ struct build_configuration {
bool verbose;

std::string tmp_dirname;
input_build_type input_type;

void print() const {
std::cout << "k = " << k << ", m = " << m << ", seed = " << seed << ", l = " << l
<< ", c = " << c
<< ", canonical_parsing = " << (canonical_parsing ? "true" : "false")
<< ", weighted = " << (weighted ? "true" : "false") << std::endl;
<< ", weighted = " << (weighted ? "true" : "false")
<< ", file type = " << (input_type==input_build_type::fasta ? "fasta" : "cfseg") << std::endl;
}
};

Expand Down Expand Up @@ -208,6 +214,28 @@ static void uint_kmer_to_string(kmer_t x, char* str, uint64_t k) {
}
}

/* Map an uppercase nucleotide to its 2-bit code: A -> 0, C -> 1, G -> 2, T -> 3.
   Any other character trips the assert; with asserts compiled out the function
   returns uint64_t(-1). */
static inline uint64_t char_to_uint64(char c) {
    if (c == 'A') return 0;
    if (c == 'C') return 1;
    if (c == 'G') return 2;
    if (c == 'T') return 3;
    assert(false);
    return static_cast<uint64_t>(-1);
}

/* Pack the first k characters of str into a 64-bit integer, two bits per
   character, with the code of str[i] shifted left by 2*i (so str[0] lands in the
   lowest bits). Requires k <= 32. */
[[maybe_unused]] static uint64_t string_to_uint64_no_reverse(char const* str, uint64_t k) {
    assert(k <= 32);
    uint64_t packed = 0;
    uint64_t shift = 0;
    for (char const* p = str; p != str + k; ++p) {
        packed += char_to_uint64(*p) << shift;
        shift += 2;
    }
    return packed;
}

[[maybe_unused]] static std::string uint_kmer_to_string(kmer_t x, uint64_t k) {
assert(k <= constants::max_k);
std::string str;
Expand Down
34 changes: 28 additions & 6 deletions src/build.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@ int build(int argc, char** argv) {

/* Required arguments. */
parser.add("input_filename",
"Must be a FASTA file (.fa/fasta extension) compressed with gzip (.gz) or not:\n"
"\t- without duplicate nor invalid kmers\n"
"Must be a FASTA file (.fa/fasta extension) or cf_seg file compressed with gzip (.gz) or not:\n"
"\t- FASTA file should be without duplicate nor invalid kmers\n"
"\t- one DNA sequence per line.\n"
"\tFor example, it could be the de Bruijn graph topology output by BCALM.",
"\tFor example, it could be the de Bruijn graph topology output by BCALM.\n"
"\t- cfseg file is the output file produced by CUTTLEFISH with -f 3\n",
"-i", true);
parser.add("k", "K-mer length (must be <= " + std::to_string(constants::max_k) + ").", "-k",
true);
parser.add("m", "Minimizer length (must be < k).", "-m", true);
parser.add("f", "Format of input (must be fasta | cfseg).", "-f", false);

/* Optional arguments. */
parser.add("seed",
Expand Down Expand Up @@ -52,18 +54,38 @@ int build(int argc, char** argv) {
auto input_filename = parser.get<std::string>("input_filename");
auto k = parser.get<uint64_t>("k");
auto m = parser.get<uint64_t>("m");
auto fmt = parser.get<std::string>("f");

if (fmt != "fasta" && fmt != "cfseg" && fmt != "") {
std::cerr << "unknown input format selected, should be either `fasta` or `cfseg` \n";
std::cerr << "[" << fmt << "]\n";
std::exit(1);
}

dictionary dict;

build_configuration build_config;
build_config.k = k;
build_config.m = m;

if (fmt == "fasta" || fmt == "") {
fmt = "fasta";
build_config.input_type = sshash::input_build_type::fasta;
} else if (fmt == "cfseg") {
build_config.input_type = sshash::input_build_type::cfseg;
}

if (parser.parsed("seed")) build_config.seed = parser.get<uint64_t>("seed");
if (parser.parsed("l")) build_config.l = parser.get<double>("l");
if (parser.parsed("c")) build_config.c = parser.get<double>("c");
build_config.canonical_parsing = parser.get<bool>("canonical_parsing");
build_config.weighted = parser.get<bool>("weighted");

if (build_config.weighted && fmt=="cfseg") {
std::cerr << "weighted index file for cfseg is not supported\n";
std::exit(1);
}

build_config.verbose = parser.get<bool>("verbose");
if (parser.parsed("tmp_dirname")) {
build_config.tmp_dirname = parser.get<std::string>("tmp_dirname");
Expand All @@ -76,8 +98,8 @@ int build(int argc, char** argv) {

bool check = parser.get<bool>("check");
if (check) {
check_correctness_lookup_access(dict, input_filename);
check_correctness_navigational_kmer_query(dict, input_filename);
check_correctness_lookup_access(dict, input_filename, fmt);
check_correctness_navigational_kmer_query(dict, input_filename, fmt);
check_correctness_navigational_contig_query(dict);
if (build_config.weighted) check_correctness_weights(dict, input_filename);
check_correctness_kmer_iterator(dict);
Expand All @@ -97,4 +119,4 @@ int build(int argc, char** argv) {
}

return 0;
}
}
Loading
Loading