diff --git a/CMakeLists.txt b/CMakeLists.txt index a0138d29..f1f7d73f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -336,6 +336,9 @@ add_library(smoothxg_objs OBJECT src/tempfile.cpp deps/xxHash/xxhash.c src/xg.cpp + src/dna.cpp + src/pos.cpp + src/seqindex.cpp src/chain.cpp src/prep.cpp src/cleanup.cpp diff --git a/src/dna.cpp b/src/dna.cpp new file mode 100644 index 00000000..56dfd334 --- /dev/null +++ b/src/dna.cpp @@ -0,0 +1,64 @@ +#include "dna.hpp" + +namespace smoothxg { + +static const char dna_complement[256] = {'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 8 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 16 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 24 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 32 + 'N', 'N', 'N', '$', '#', 'N', 'N', 'N', // 40 GCSA stop/start characters + 'N', 'N', 'N', 'N', 'N', '-', 'N', 'N', // 48 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 56 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 64 + 'N', 'T', 'V', 'G', 'H', 'N', 'N', 'C', // 72 + 'D', 'N', 'N', 'M', 'N', 'K', 'N', 'N', // 80 + 'N', 'Q', 'Y', 'W', 'A', 'A', 'B', 'S', // 88 + 'N', 'R', 'N', 'N', 'N', 'N', 'N', 'N', // 96 + 'N', 't', 'v', 'g', 'h', 'N', 'N', 'c', // 104 + 'd', 'N', 'N', 'm', 'N', 'k', 'n', 'N', // 112 + 'N', 'q', 'y', 'w', 'a', 'a', 'b', 's', // 120 + 'N', 'r', 'N', 'N', 'N', 'N', 'N', 'N', // 128 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 136 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 144 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 152 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 160 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 168 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 176 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 184 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 192 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 200 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 208 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 216 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 224 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 232 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 240 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N', // 248 + 'N', 'N', 'N', 'N', 'N', 'N', 'N', 'N'};// 256 + +char dna_reverse_complement(const char& c) { + return dna_complement[c]; +} + +std::string dna_reverse_complement(const std::string& seq) { + std::string rc; + rc.assign(seq.rbegin(), seq.rend()); + for (auto& c : rc) { + c = dna_complement[c]; + } + return rc; +} + +void dna_reverse_complement_in_place(std::string& seq) { + size_t swap_size = seq.size() / 2; + for (size_t i = 0, j = seq.size() - 1; i < swap_size; i++, j--) { + char tmp = seq[i]; + seq[i] = dna_complement[seq[j]]; + seq[j] = dna_complement[tmp]; + } + + if (seq.size() % 2) { + seq[swap_size] = dna_complement[seq[swap_size]]; + } +} + +} diff --git a/src/dna.hpp b/src/dna.hpp new file mode 100644 index 00000000..b8d68d26 --- /dev/null +++ b/src/dna.hpp @@ -0,0 +1,14 @@ +#ifndef DNA_HPP_INCLUDED +#define DNA_HPP_INCLUDED + +#include + +namespace smoothxg { + +char dna_reverse_complement(const char& c); +std::string dna_reverse_complement(const std::string& seq); +void dna_reverse_complement_in_place(std::string& seq); + +} + +#endif diff --git a/src/main.cpp b/src/main.cpp index a0f6373a..4c6e359e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -26,6 +26,7 @@ #include #include "include/smoothxg_git_version.hpp" #include +#include "seqindex.hpp" // If the SMOOTHXG_GIT_VERSION doesn't exist at all, define a placeholder #ifndef SMOOTHXG_GIT_VERSION @@ -35,6 +36,18 @@ using namespace std; using namespace xg; +std::unique_ptr load_block( + std::vector &block_graphs, + uint64_t idx) { + std::string data; + zstdutil::DecompressString(*block_graphs[idx], data); + stringstream ss; + ss << data; + ss.seekg(0,std::ios_base::beg); + auto x = std::make_unique(); + x->deserialize_members(ss); + return x; +} int main(int argc, char **argv) { args::ArgumentParser parser("smoothxg: collinear block finder and graph consensus generator\n" + std::string(SMOOTHXG_GIT_VERSION)); @@ -363,201 +376,665 @@ int main(int argc, char **argv) { const uint64_t target_poa_length = (uint64_t)smoothxg::handy_parameter(target_poa_lengths[current_iter], 4000); const uint64_t max_poa_length = _max_poa_length ? (uint64_t)smoothxg::handy_parameter(args::get(_max_poa_length), (2 * target_poa_length)) : 2 * target_poa_length; const uint64_t max_block_weight = _max_block_weight ? (uint64_t)smoothxg::handy_parameter(args::get(_max_block_weight), (target_poa_length * n_haps)) : target_poa_length * n_haps; - auto graph = std::make_unique(); - const uint64_t current_iter_1_based = current_iter + 1; - std::stringstream smoothxg_iter_stream; - smoothxg_iter_stream << "[smoothxg::" << "(" << current_iter_1_based << "-" << num_iterations << ")"; - const std::string smoothxg_iter = smoothxg_iter_stream.str(); + + const uint64_t current_iter_1_based = current_iter + 1; + std::stringstream smoothxg_iter_stream; + smoothxg_iter_stream << "[smoothxg::" << "(" << current_iter_1_based << "-" << num_iterations << ")"; + const std::string smoothxg_iter = smoothxg_iter_stream.str(); std::cerr << smoothxg_iter << "::main] loading graph" << std::endl; - // The first iteration can start from the input XG index - if (current_iter == 0 && !args::get(xg_in).empty()) { - std::ifstream in(args::get(xg_in)); - graph->deserialize(in); - } else { - std::string gfa_in_name; - if (!args::get(no_prep)) { - if (args::get(tmp_base).empty()) { - gfa_in_name = path_input_gfa + ".prep." + std::to_string(current_iter) + ".gfa"; + // mapping from path fragments to block graphs + auto _path_mapping_tmp = temp_file::create(); + auto path_mapping_ptr = std::make_unique>(_path_mapping_tmp); + auto& path_mapping = *path_mapping_ptr; + path_mapping.open_writer(); + + auto _block_graphs = std::make_unique>(); + auto& block_graphs = *_block_graphs; // get a reference to the contained vector + + // mapping from block to consensus ids + std::vector consensus_mapping; + + std::vector> merged_block_id_intervals_tree_vector; + std::vector block_id_ranges_vector; + ska::flat_hash_set inverted_merged_block_id_intervals_ranks; // IITree can't store inverted intervals + + std::vector is_block_in_a_merged_group; + + // We add consensus paths only during the last iteration + const std::string consensus_base_name = (current_iter == num_iterations - 1) && add_consensus ? consensus_path_prefix : ""; + + ska::flat_hash_map> path_handle_2_name_and_length; + + auto seqidx_ptr = std::make_unique(); + auto& seqidx = *seqidx_ptr; + + uint64_t block_count; + + { + auto graph = std::make_unique(); + + // The first iteration can start from the input XG index + if (current_iter == 0 && !args::get(xg_in).empty()) { + std::ifstream in(args::get(xg_in)); + graph->deserialize(in); + } else { + std::string gfa_in_name; + if (!args::get(no_prep)) { + if (args::get(tmp_base).empty()) { + gfa_in_name = path_input_gfa + ".prep." + std::to_string(current_iter) + ".gfa"; + } else { + const std::string filename = filesystem::path(path_input_gfa).filename(); + gfa_in_name = args::get(tmp_base) + '/' + filename + ".prep." + std::to_string(current_iter) + ".gfa"; + } + std::cerr << smoothxg_iter << "::main] prepping graph for smoothing" << std::endl; + smoothxg::prep(args::get(gfa_in), gfa_in_name, node_chop, + term_updates, true, temp_file::get_dir() + '/', n_threads, + smoothxg_iter); } else { - const std::string filename = filesystem::path(path_input_gfa).filename(); - gfa_in_name = args::get(tmp_base) + '/' + filename + ".prep." + std::to_string(current_iter) + ".gfa"; + gfa_in_name = path_input_gfa; + } + std::cerr << smoothxg_iter << "::main] building xg index" << std::endl; + graph->from_gfa(gfa_in_name, false, temp_file::get_dir() + '/'); + if (!args::get(keep_temp) && !args::get(no_prep)) { + std::remove(gfa_in_name.c_str()); } - std::cerr << smoothxg_iter << "::main] prepping graph for smoothing" << std::endl; - smoothxg::prep(args::get(gfa_in), gfa_in_name, node_chop, - term_updates, true, temp_file::get_dir() + '/', n_threads, - smoothxg_iter); - } else { - gfa_in_name = path_input_gfa; } - std::cerr << smoothxg_iter << "::main] building xg index" << std::endl; - graph->from_gfa(gfa_in_name, false, temp_file::get_dir() + '/'); - if (!args::get(keep_temp) && !args::get(no_prep)) { - std::remove(gfa_in_name.c_str()); + + auto *blockset = new smoothxg::blockset_t(); + smoothxg::smoothable_blocks(*graph, + *blockset, + max_block_weight, + target_poa_length, + max_block_jump, + max_edge_jump, + order_paths_from_longest, + num_threads, + smoothxg_iter); + + const uint64_t min_autocorr_z = 5; + const uint64_t autocorr_stride = 50; + + smoothxg::break_blocks(*graph, + blockset, + block_length_ratio_min, + min_length_mash_based_clustering, + block_group_identity, + block_group_est_identity, + kmer_size, + min_dedup_depth_for_block_splitting, + min_dedup_depth_for_mash_clustering, + max_poa_length, + min_copy_length, + max_copy_length, + min_autocorr_z, + autocorr_stride, + order_paths_from_longest, + true, + n_threads, + #ifdef POA_DEBUG + args::get(write_block_to_split_fastas), + #endif + smoothxg_iter); + + // build the path_step_rank_ranges -> index_in_blocks_vector + // flat_hash_map using SKA: KEY: path_name, VALUE: sorted interval_tree using cgranges https://github.com/lh3/cgranges: + // we collect path_step_rank_ranges and the identifier of an interval is the index of a block in the blocks vector + //ska::flat_hash_map> happy_tree_friends = smoothxg::generate_path_nuc_range_block_index(blocks, graph); + + const bool local_alignment = !args::get(change_alignment_mode); + + std::string maf_header; + // We emit the MAF file only during the last iteration + if ((current_iter == num_iterations - 1) && write_msa_in_maf_format) { + basic_string filename; + if (!args::get(xg_in).empty()) { + size_t found = args::get(xg_in).find_last_of("/\\"); + filename = (args::get(xg_in).substr(found + 1)); + } else if (!args::get(gfa_in).empty()) { + size_t found = args::get(gfa_in).find_last_of("/\\"); + filename = (args::get(gfa_in).substr(found + 1)); + } + + maf_header += "##maf version=1\n"; + maf_header += "# smoothxg\n"; + maf_header += "# input=" + filename + " sequences=" + std::to_string(graph->get_path_count()) + "\n"; + + // Merge mode + maf_header += "# merge_blocks="; + maf_header += (args::get(merge_blocks) ? "true" : "false"); + maf_header += " contiguous_path_jaccard=" + std::to_string(contiguous_path_jaccard) + "\n"; + + // POA + maf_header += "# POA="; + maf_header += (args::get(use_abpoa) ? "abPOA" : "SPOA"); + maf_header += " alignment_mode="; + maf_header += (local_alignment ? "local" : "global"); + maf_header += " order_paths=from_"; + maf_header += (order_paths_from_longest ? "longest" : "shortest"); + maf_header += "\n"; + + // create_blocks + maf_header += "# max_block_weight=" + std::to_string(max_block_weight) + + " max_block_jump=" + std::to_string(max_block_jump) + + " max_edge_jump=" + std::to_string(max_edge_jump) + "\n"; + + // break_blocks + maf_header += "# max_poa_length=" + std::to_string(max_poa_length) + + " min_copy_length=" + std::to_string(min_copy_length) + + " max_copy_length=" + std::to_string(max_copy_length) + + " min_autocorr_z=" + std::to_string(min_autocorr_z) + + " autocorr_stride=" + std::to_string(autocorr_stride) + "\n"; + + // split_blocks + maf_header += "# block_group_identity=" + std::to_string(block_group_identity) + + " block_group_estimated_identity=" + std::to_string(block_group_est_identity) + + " min_length_mash_based_clustering=" + std::to_string(min_length_mash_based_clustering) + + " min_dedup_depth_for_mash_clustering=" + + std::to_string(min_dedup_depth_for_mash_clustering) + + " kmer_size=" + std::to_string(_kmer_size) + "\n"; } - } - auto *blockset = new smoothxg::blockset_t(); - smoothxg::smoothable_blocks(*graph, - *blockset, - max_block_weight, - target_poa_length, - max_block_jump, - max_edge_jump, - order_paths_from_longest, - num_threads, - smoothxg_iter); - - const uint64_t min_autocorr_z = 5; - const uint64_t autocorr_stride = 50; - - smoothxg::break_blocks(*graph, - blockset, - block_length_ratio_min, - min_length_mash_based_clustering, - block_group_identity, - block_group_est_identity, - kmer_size, - min_dedup_depth_for_block_splitting, - min_dedup_depth_for_mash_clustering, - max_poa_length, - min_copy_length, - max_copy_length, - min_autocorr_z, - autocorr_stride, - order_paths_from_longest, - true, - n_threads, + block_count = blockset->size(); + + if ((current_iter == num_iterations - 1) && add_consensus) { + consensus_mapping.resize(block_count); + + if (merge_blocks) { + is_block_in_a_merged_group.resize(block_count); + } + } + + _block_graphs->resize(block_count, nullptr); + + smoothxg::smooth_and_lace(*graph, + blockset, + path_mapping, + block_graphs, + consensus_mapping, + merged_block_id_intervals_tree_vector, + block_id_ranges_vector, + inverted_merged_block_id_intervals_ranks, + is_block_in_a_merged_group, + poa_m, + poa_n, + poa_g, + poa_e, + poa_q, + poa_c, + args::get(adaptive_poa_params), + kmer_size, + poa_padding_fraction, + max_block_depth_for_padding_more, + local_alignment, + n_threads, + n_poa_threads, + (current_iter == num_iterations - 1) ? args::get(write_msa_in_maf_format) : "", maf_header, + args::get(merge_blocks), args::get(_preserve_unmerged_consensus), + contiguous_path_jaccard, + args::get(use_abpoa), + consensus_base_name, + consensus_path_names, #ifdef POA_DEBUG - args::get(write_block_to_split_fastas), + write_block_fastas, #endif - smoothxg_iter); - - // build the path_step_rank_ranges -> index_in_blocks_vector - // flat_hash_map using SKA: KEY: path_name, VALUE: sorted interval_tree using cgranges https://github.com/lh3/cgranges: - // we collect path_step_rank_ranges and the identifier of an interval is the index of a block in the blocks vector - //ska::flat_hash_map> happy_tree_friends = smoothxg::generate_path_nuc_range_block_index(blocks, graph); - - const bool local_alignment = !args::get(change_alignment_mode); - - std::string maf_header; - // We emit the MAF file only during the last iteration - if ((current_iter == num_iterations - 1) && write_msa_in_maf_format) { - basic_string filename; - if (!args::get(xg_in).empty()) { - size_t found = args::get(xg_in).find_last_of("/\\"); - filename = (args::get(xg_in).substr(found + 1)); - } else if (!args::get(gfa_in).empty()) { - size_t found = args::get(gfa_in).find_last_of("/\\"); - filename = (args::get(gfa_in).substr(found + 1)); - } + max_merged_groups_in_memory, + smoothxg_iter); - maf_header += "##maf version=1\n"; - maf_header += "# smoothxg\n"; - maf_header += "# input=" + filename + " sequences=" + std::to_string(graph->get_path_count()) + "\n"; - - // Merge mode - maf_header += "# merge_blocks="; - maf_header += (args::get(merge_blocks) ? "true" : "false"); - maf_header += " contiguous_path_jaccard=" + std::to_string(contiguous_path_jaccard) + "\n"; - - // POA - maf_header += "# POA="; - maf_header += (args::get(use_abpoa) ? "abPOA" : "SPOA"); - maf_header += " alignment_mode="; - maf_header += (local_alignment ? "local" : "global"); - maf_header += " order_paths=from_"; - maf_header += (order_paths_from_longest ? "longest" : "shortest"); - maf_header += "\n"; - - // create_blocks - maf_header += "# max_block_weight=" + std::to_string(max_block_weight) + - " max_block_jump=" + std::to_string(max_block_jump) + - " max_edge_jump=" + std::to_string(max_edge_jump) + "\n"; - - // break_blocks - maf_header += "# max_poa_length=" + std::to_string(max_poa_length) + - " min_copy_length=" + std::to_string(min_copy_length) + - " max_copy_length=" + std::to_string(max_copy_length) + - " min_autocorr_z=" + std::to_string(min_autocorr_z) + - " autocorr_stride=" + std::to_string(autocorr_stride) + "\n"; - - // split_blocks - maf_header += "# block_group_identity=" + std::to_string(block_group_identity) + - " block_group_estimated_identity=" + std::to_string(block_group_est_identity) + - " min_length_mash_based_clustering=" + std::to_string(min_length_mash_based_clustering) + - " min_dedup_depth_for_mash_clustering=" + - std::to_string(min_dedup_depth_for_mash_clustering) + - " kmer_size=" + std::to_string(_kmer_size) + "\n"; - } + // Save path names, lengths and sequences from the input graph as we will need them later + graph->for_each_path_handle([&](const path_handle_t& path) { + path_handle_2_name_and_length[path] = std::make_pair(graph->get_path_name(path), graph->get_path_length(path)); + }); + + std::cerr << smoothxg_iter << "::smooth_and_lace] indexing sequences" << std::endl; + seqidx.build_index(*graph); + seqidx.save(); + delete blockset; + } + // Here "graph" is deallocated, avoiding keeping it in memory together with the smoothed graph we are going to create + + std::cerr << smoothxg_iter << "::smooth_and_lace] sorting path fragments" << std::endl; + // sort the path range mappings by path handle id, then start position + // this will allow us to walk through them in order + /* + ips4o::parallel::sort( + path_mapping.begin(), path_mapping.end(), + [](const path_position_range_t &a, const path_position_range_t &b) { + auto &a_id = as_integer(get_base_path(a)); + auto &b_id = as_integer(get_base_path(b)); + return (a_id < b_id || a_id == b_id && get_start_pos(a) < get_start_pos(b)); + }); + */ + path_mapping.index(n_threads); + std::cerr << smoothxg_iter << "::smooth_and_lace] sorted " << path_mapping.size() << " path fragments" << std::endl; + + // build the sequence and edges into the output graph + auto* smoothed = new odgi::graph_t(); + const uint64_t sample_rate = block_count > 12000000 ? 4 : (block_count > 6000000 ? 2 : 0); { - auto smoothed = smoothxg::smooth_and_lace(*graph, - blockset, - poa_m, - poa_n, - poa_g, - poa_e, - poa_q, - poa_c, - args::get(adaptive_poa_params), - kmer_size, - poa_padding_fraction, - max_block_depth_for_padding_more, - local_alignment, - n_threads, - n_poa_threads, - (current_iter == num_iterations - 1) ? args::get(write_msa_in_maf_format) : "", maf_header, - args::get(merge_blocks), args::get(_preserve_unmerged_consensus), - contiguous_path_jaccard, - args::get(use_abpoa), - // We add consensus paths only during the last iteration - (current_iter == num_iterations - 1) && add_consensus ? consensus_path_prefix : "", - consensus_path_names, -#ifdef POA_DEBUG - write_block_fastas, -#endif - max_merged_groups_in_memory, - smoothxg_iter); - - std::cerr << smoothxg_iter << "::main] unchopping smoothed graph" << std::endl; - odgi::algorithms::unchop(*smoothed, n_threads, true); - - uint64_t smoothed_nodes = 0; - uint64_t smoothed_length = 0; - smoothed->for_each_handle( - [&](const handle_t &h) { - ++smoothed_nodes; - smoothed_length += smoothed->get_length(h); + std::stringstream load_graphs_banner; + load_graphs_banner << smoothxg_iter << "::smooth_and_lace] loading " << block_count << " graph blocks:"; + progress_meter::ProgressMeter load_graphs_progress(block_count, load_graphs_banner.str()); + std::vector> graphs(block_count); + #pragma omp parallel for schedule(dynamic,1) + for (uint64_t idx = 0; idx < block_count; ++idx) { + if (sample_rate == 0 || 0 == smoothxg::modulo(idx, sample_rate)) { + std::string data; + zstdutil::DecompressString(*block_graphs[idx], data); + stringstream ss; + ss << data; + ss.seekg(0,std::ios_base::beg); + graphs[idx] = std::make_unique(); + graphs[idx]->deserialize_members(ss); + delete block_graphs[idx]; + } + load_graphs_progress.increment(1); + } + load_graphs_progress.finish(); + + std::stringstream add_graph_banner; + add_graph_banner << smoothxg_iter << "::smooth_and_lace] adding nodes and edges from " << block_count << " graphs:"; + progress_meter::ProgressMeter add_graph_progress(block_count, add_graph_banner.str()); + + std::vector id_mapping; + for (uint64_t idx = 0; idx < block_count; ++idx) { + uint64_t id_trans = smoothed->get_node_count(); + id_mapping.push_back(id_trans); // record the id translation + + std::unique_ptr temp_block; + // temp_block unique pointer to ensure we don't lose ownership if we load a block using the load_block function. + // The , (comma) operator in the ternary condition ensures the temporary loaded block gets assigned to temp_block before being used. + // This approach keeps the block in scope and properly managed during the loop iteration. + auto& block = (sample_rate == 0 || 0 == smoothxg::modulo(idx, sample_rate)) ? graphs[idx] : (temp_block = load_block(block_graphs, idx), temp_block); + if (block->get_node_count() == 0) { + continue; + } + block->for_each_handle([&](const handle_t &h) { + smoothed->create_handle(block->get_sequence(h)); + }); + block->for_each_edge([&](const edge_t &e) { + smoothed->create_edge( + smoothed->get_handle(id_trans + block->get_id(e.first)), + smoothed->get_handle(id_trans + block->get_id(e.second))); + }); + add_graph_progress.increment(1); + } + add_graph_progress.finish(); + + // Prepare the data needed to embed the paths in parallel + ska::flat_hash_map> path_handle_2_start_and_end_in_path_mapping; + { + path_handle_t prec_path = smoothxg::get_base_path(path_mapping.read_value(0)); + uint64_t prec_i = 0; + for (uint64_t i = 1; i < path_mapping.size(); ++i) { + path_handle_t current_path = smoothxg::get_base_path(path_mapping.read_value(i)); + if (current_path != prec_path) { + // Create path handles in the output graph not in parallel to preserve their order + smoothed->create_path_handle(path_handle_2_name_and_length[prec_path].first); + + // Record the start and end of the path in the path_mapping + path_handle_2_start_and_end_in_path_mapping[prec_path] = std::make_pair(prec_i, i-1); + + prec_path = current_path; + prec_i = i; + } + } + if (path_mapping.size() == 1 || prec_i != path_mapping.size() - 1) { + smoothed->create_path_handle(path_handle_2_name_and_length[prec_path].first); + path_handle_2_start_and_end_in_path_mapping[prec_path] = std::make_pair(prec_i, path_mapping.size() - 1); + } + } + + // then for each path, ensure that it's embedded in the graph by walking through + // its block segments in order and linking them up in the output graph + // do it in parallel + std::stringstream lace_banner; + lace_banner << smoothxg_iter << "::smooth_and_lace] embedding " << path_mapping.size() << " path fragments:"; + progress_meter::ProgressMeter lace_progress(path_mapping.size(), lace_banner.str()); + auto it = path_handle_2_start_and_end_in_path_mapping.begin(); + #pragma omp parallel for schedule(dynamic,1) + for (uint64_t i = 0; i < path_handle_2_start_and_end_in_path_mapping.size(); ++i) { + auto local_it = std::next(it, i); + + //int thread_num = omp_get_thread_num(); + //#pragma omp critical + //std::cout << "Thread " << thread_num << " handling: " << local_it->second.first << " - " << local_it->second.second << "\n"; + + smoothxg::path_position_range_t pos_range = path_mapping.read_value(local_it->second.first); + step_handle_t last_step = {0, 0}; + bool first = true; + uint64_t last_end_pos = 0; + + const path_handle_t smoothed_path = smoothed->get_path_handle(path_handle_2_name_and_length[smoothxg::get_base_path(pos_range)].first); + + // walk the path from start to end + for (uint64_t j = local_it->second.first; j <= local_it->second.second; ++j) { + pos_range = path_mapping.read_value(j); + + // if we find a segment that's not included in any block, we'll add + // it to the final graph and link it in to do so, we detect a gap in + // length, collect the sequence in the gap and add it to the graph + // as a node then add it as a traversal to the path + if (smoothxg::get_start_pos(pos_range) - last_end_pos > 0) { + assert(false); // assert that we've included all sequence in blocks + } + // write the path steps into the graph using the id translation + auto block_id = smoothxg::get_block_id(pos_range); + std::unique_ptr temp_block; + auto& block = (sample_rate == 0 || 0 == smoothxg::modulo(block_id, sample_rate)) ? graphs[block_id] : (temp_block = load_block(block_graphs, block_id), temp_block); + auto id_trans = id_mapping.at(block_id); + block->for_each_step_in_path( + smoothxg::get_target_path(pos_range), [&](const step_handle_t &step) { + handle_t h = block->get_handle_of_step(step); + handle_t t = smoothed->get_handle(block->get_id(h) + id_trans, + block->get_is_reverse(h)); + smoothed->append_step(smoothed_path, t); + if (first) { + first = false; + // create edge between last and curr + if (as_integers(last_step)[0] != 0) { + smoothed->create_edge( + smoothed->get_handle_of_step(last_step), t); + } + } + }); + last_step = smoothed->path_back(smoothed_path); + last_end_pos = smoothxg::get_end_pos(pos_range); + + lace_progress.increment(1); + } + + // now add in any final sequence in the path + // and add it to the path, add the edge + if (path_handle_2_name_and_length[smoothxg::get_base_path(pos_range)].second > last_end_pos) { + assert(false); // assert that we've included all sequence in the blocks + } + } + lace_progress.finish(); + + path_mapping.close_reader(); + std::remove(_path_mapping_tmp.c_str()); + path_mapping_ptr.reset(nullptr); + + // now verify that smoothed has paths that are equal to the base graph + // and that all the paths are fully embedded in the graph + { + std::vector paths; // for parallel iteration + smoothed->for_each_path_handle([&](const path_handle_t &path) { + paths.push_back(path); + }); + + std::stringstream validate_banner; + validate_banner << smoothxg_iter << "::smooth_and_lace] validating " << paths.size() << " path sequences:"; + progress_meter::ProgressMeter validate_progress(paths.size(), validate_banner.str()); + + #pragma omp parallel for schedule(dynamic,1) + for (uint64_t i = 0; i < paths.size(); ++i) { + auto path = paths[i]; + + std::string orig_seq, smoothed_seq; + orig_seq = seqidx.seq(smoothed->get_path_name(path)); + smoothed->for_each_step_in_path(path, [&](const step_handle_t &step) { + smoothed_seq.append(smoothed->get_sequence(smoothed->get_handle_of_step(step))); }); - std::cerr << smoothxg_iter << "::main] smoothed graph length " << smoothed_length << "bp " << "in " - << smoothed_nodes << " nodes" << std::endl; - - std::string path_smoothed_gfa; - if (current_iter < num_iterations - 1) { - consensus_path_names.clear(); // We need this only at the last iteration - const std::string patent_dir = args::get(tmp_base).empty() ? - filesystem::path(path_input_gfa).parent_path().string() : - args::get(tmp_base); - if (patent_dir == "") { - path_smoothed_gfa = prefix + ".smooth." + std::to_string(current_iter) + ".gfa"; - } else { - path_smoothed_gfa = patent_dir + "/" + prefix + ".smooth." + std::to_string(current_iter) + ".gfa"; - } - } else { - path_smoothed_gfa = smoothed_out_gfa; + if (orig_seq != smoothed_seq) { + std::cerr << smoothxg_iter << "] error! path " + << smoothed->get_path_name(path) + << " was corrupted in the smoothed graph" << std::endl + << "original\t" << orig_seq << std::endl + << "smoothed\t" << smoothed_seq << std::endl; + exit(1); + } + + validate_progress.increment(1); + } + validate_progress.finish(); } - std::cerr << smoothxg_iter << "::main] writing smoothed graph to " << path_smoothed_gfa << std::endl; - ofstream out(path_smoothed_gfa.c_str()); - smoothed->to_gfa(out); - out.close(); - delete smoothed; + if (!consensus_mapping.empty()) { + std::cerr << smoothxg_iter << "::smooth_and_lace] sorting consensus" << std::endl; + + // consensus path and connections + + // by definition, the consensus paths are embedded in our blocks, which simplifies + // things we'll still need to add a new path for each consensus path - path_input_gfa = path_smoothed_gfa; + // flag the blocks that we should include unmerged + atomicbitvector::atomic_bv_t exclude_unmerged_consensus(block_count); + + // Is there something merged? + if (!merged_block_id_intervals_tree_vector.empty()) { + #pragma omp parallel for schedule(dynamic,1) + for (auto& merged_block_id_intervals_tree : merged_block_id_intervals_tree_vector) { + merged_block_id_intervals_tree.index(); + } + + if (!args::get(_preserve_unmerged_consensus)) { + std::cerr << smoothxg_iter << "::smooth_and_lace] embedding consensus: removing redundant single consensus" << std::endl; + + #pragma omp parallel for schedule(dynamic,1) + for (uint64_t id = 0; id < consensus_mapping.size(); ++id) { + if (is_block_in_a_merged_group[id]) { + exclude_unmerged_consensus.set(id); + } + } + } + } + + // all raw consensus paths + std::vector consensus_paths(block_count); + + // Unmerged consensus sequences + // First, create the path handles + std::cerr << smoothxg_iter << "::smooth_and_lace] embedding consensus: creating path handles" << std::endl; + for (uint64_t id = 0; id < consensus_mapping.size(); ++id) { + //for (auto &pos_range : consensus_mapping) { + if (!exclude_unmerged_consensus.test(id)) { + std::unique_ptr temp_block; + auto& block = (sample_rate == 0 || 0 == smoothxg::modulo(id, sample_rate)) ? graphs[id] : (temp_block = load_block(block_graphs, id), temp_block); + consensus_paths[id] = smoothed->create_path_handle(block->get_path_name(consensus_mapping[id])); + } // else skip the embedding of the single consensus sequences + } + + // Next, add the steps + std::cerr << smoothxg_iter << "::smooth_and_lace] embedding consensus: creating step handles" << std::endl; + #pragma omp parallel for schedule(dynamic,1) + for (uint64_t id = 0; id < consensus_mapping.size(); ++id) { + //for(auto& pos_range : consensus_mapping){ + if (exclude_unmerged_consensus.test(id)) { + continue; // skip the embedding for the single consensus sequence + } + std::unique_ptr temp_block; + auto& block = (sample_rate == 0 || 0 == smoothxg::modulo(id, sample_rate)) ? graphs[id] : (temp_block = load_block(block_graphs, id), temp_block); + path_handle_t smoothed_path = consensus_paths[id]; + auto &id_trans = id_mapping[id]; + block->for_each_step_in_path(consensus_mapping[id], [&](const step_handle_t &step) { + handle_t h = block->get_handle_of_step(step); + handle_t t = smoothed->get_handle(block->get_id(h) + id_trans, block->get_is_reverse(h)); + smoothed->append_step(smoothed_path, t); + // nb: by definition of our construction of smoothed + // the consensus paths should have all their edges embedded + }); + } + + // Merged consensus sequences + if (!merged_block_id_intervals_tree_vector.empty()) { + // First, create the path handles + std::cerr << smoothxg_iter << "::smooth_and_lace] embedding merged consensus: creating path handles" << std::endl; + std::vector merged_consensus_paths; + + for (auto &block_id_ranges : block_id_ranges_vector) { + assert(!smoothed->has_path(consensus_base_name + block_id_ranges)); + merged_consensus_paths.push_back( + smoothed->create_path_handle(consensus_base_name + block_id_ranges) + ); + } + + // Next, add the steps + std::cerr << smoothxg_iter << "::smooth_and_lace] embedding merged consensus: creating step handles" << std::endl; + std::mutex consensus_path_is_merged_mutex; + ska::flat_hash_set consensus_path_is_merged; + assert(merged_block_id_intervals_tree_vector.size() == block_id_ranges_vector.size()); + + #pragma omp parallel for schedule(dynamic,1) + for (uint64_t i = 0; i < merged_block_id_intervals_tree_vector.size(); ++i) { + auto &merged_block_id_intervals_tree = merged_block_id_intervals_tree_vector[i]; + + bool inverted_intervals = inverted_merged_block_id_intervals_ranks.count(i) != 0; + path_handle_t consensus_path = merged_consensus_paths[i]; + + std::vector merged_block_id_intervals; + merged_block_id_intervals_tree.overlap(0, block_count, merged_block_id_intervals); + + uint64_t start_interval = 0; + uint64_t end_interval = merged_block_id_intervals.size() - 1; + int8_t step_interval = 1; + int8_t step = 1; + if (inverted_intervals) { + start_interval = merged_block_id_intervals.size() - 1; + end_interval = 0; + step_interval = -1; + step = -1; + } + + for (uint64_t j = start_interval; j != (end_interval + step_interval); j += step_interval) { + auto &merged_block_id_interval_idx = merged_block_id_intervals[j]; + + uint64_t start = merged_block_id_intervals_tree.start(merged_block_id_interval_idx); + uint64_t end = merged_block_id_intervals_tree.end(merged_block_id_interval_idx) - 1; + if (inverted_intervals){ + uint64_t tmp = start; + start = end; + end = tmp; + + /*{ + std::lock_guard guard(consensus_path_is_merged_mutex); + + std::cerr << i << ": start-end " << start << "-" << end < guard(consensus_path_is_merged_mutex); + + consensus_path_is_merged.insert(as_integer(consensus_paths[block_id])); + } + + std::unique_ptr temp_block; + auto& block = (sample_rate == 0 || 0 == smoothxg::modulo(block_id, sample_rate)) ? graphs[block_id] : (temp_block = load_block(block_graphs, block_id), temp_block); + auto& id_trans = id_mapping[block_id]; + block->for_each_step_in_path( + consensus_mapping[block_id], + [&](const step_handle_t &step) { + handle_t h = block->get_handle_of_step(step); + handle_t t = smoothed->get_handle(block->get_id(h) + id_trans, block->get_is_reverse(h)); + smoothed->append_step(consensus_path, t); + }); + } + } + + clear_string(block_id_ranges_vector[i]); + } + + // now for each consensus path that's not been merged, and for each merged consensus path... + // record our path handles for later use in consensus graph generation + + consensus_paths.erase( + std::remove_if( + consensus_paths.begin(), consensus_paths.end(), + [&consensus_path_is_merged](const path_handle_t& path) { + return consensus_path_is_merged.count(as_integer(path)) > 0; + }), + consensus_paths.end()); + + consensus_paths.reserve( + consensus_paths.size() + + std::distance(merged_consensus_paths.begin(), + merged_consensus_paths.end())); + consensus_paths.insert( + consensus_paths.end(), + merged_consensus_paths.begin(), + merged_consensus_paths.end()); + + } + + // todo: validate the consensus paths as well + + consensus_path_names.reserve(consensus_paths.size()); + for (auto &path : consensus_paths) { + consensus_path_names.push_back(smoothed->get_path_name(path)); + } + } + } + // clear blocks that were not loaded and cleared before + for (uint64_t idx = 0; idx < block_count; ++idx) { + if (!(sample_rate == 0 || 0 == smoothxg::modulo(idx, sample_rate))) { + delete block_graphs[idx]; + } } + _block_graphs.reset(nullptr); + + { + std::stringstream embed_banner; + embed_banner << smoothxg_iter << "::smooth_and_lace] walking edges in " + << smoothed->get_path_count() << " paths:"; + progress_meter::ProgressMeter embed_progress(smoothed->get_path_count(), embed_banner.str()); + // embed all paths in the graph to ensure validity + smoothed->for_each_path_handle( + [&](const path_handle_t& path) { + handle_t last; + step_handle_t begin_step = smoothed->path_begin(path); + smoothed->for_each_step_in_path( + path, + [&](const step_handle_t &step) { + handle_t h = smoothed->get_handle_of_step(step); + if (step != begin_step) { + smoothed->create_edge(last, h); + } + last = h; + }); + embed_progress.increment(1); + }); + embed_progress.finish(); + } + + std::cerr << smoothxg_iter << "::main] unchopping smoothed graph" << std::endl; + odgi::algorithms::unchop(*smoothed, n_threads, true); + + uint64_t smoothed_nodes = 0; + uint64_t smoothed_length = 0; + smoothed->for_each_handle( + [&](const handle_t &h) { + ++smoothed_nodes; + smoothed_length += smoothed->get_length(h); + }); + std::cerr << smoothxg_iter << "::main] smoothed graph length " << smoothed_length << "bp " << "in " + << smoothed_nodes << " nodes" << std::endl; + + std::string path_smoothed_gfa; + if (current_iter < num_iterations - 1) { + consensus_path_names.clear(); // We need this only at the last iteration + const std::string patent_dir = args::get(tmp_base).empty() ? + filesystem::path(path_input_gfa).parent_path().string() : + args::get(tmp_base); + if (patent_dir == "") { + path_smoothed_gfa = prefix + ".smooth." + std::to_string(current_iter) + ".gfa"; + } else { + path_smoothed_gfa = patent_dir + "/" + prefix + ".smooth." + std::to_string(current_iter) + ".gfa"; + } + } else { + path_smoothed_gfa = smoothed_out_gfa; + } + + std::cerr << smoothxg_iter << "::main] writing smoothed graph to " << path_smoothed_gfa << std::endl; + ofstream out(path_smoothed_gfa.c_str()); + smoothed->to_gfa(out); + out.close(); + delete smoothed; - delete blockset; + path_input_gfa = path_smoothed_gfa; } // do we need to write the consensus path names? diff --git a/src/pos.cpp b/src/pos.cpp new file mode 100644 index 00000000..cd942d8f --- /dev/null +++ b/src/pos.cpp @@ -0,0 +1,72 @@ +#include "pos.hpp" + +namespace smoothxg { + +bool operator<(const aln_pos_t& a, const aln_pos_t& b) { + return a.pos < b.pos && a.aln_length < b.aln_length; +} + +bool operator==(const aln_pos_t& a, const aln_pos_t& b) { + return a.pos == b.pos && a.aln_length == b.aln_length; +} + +pos_t make_pos_t(uint64_t offset, bool is_rev) { + // top bit is reserved for is_rev flag + // the rest is our offset in the input sequence vector + uint64_t rev_mask = (uint64_t)1; // the bit mask + pos_t pos = offset<<1; + // https://graphics.stanford.edu/~seander/bithacks.html#ConditionalSetOrClearBitsWithoutBranching + pos = (pos & ~rev_mask) | (-is_rev & rev_mask); + return pos; +} + +uint64_t offset(const pos_t& pos) { + //return (pos & ~(uint64_t)1) >> 1; + return pos >> 1; +} + +bool is_rev(const pos_t& pos) { + return pos & (uint64_t)1; +} + +void incr_pos(pos_t& pos) { + if (is_rev(pos)) { + pos -= 2; + } else { + pos += 2; + } +} + +void incr_pos(pos_t& pos, size_t by) { + if (is_rev(pos)) { + pos -= 2*by; + } else { + pos += 2*by; + } +} + +void decr_pos(pos_t& pos) { + if (!is_rev(pos)) { + pos -= 2; + } else { + pos += 2; + } +} + +void decr_pos(pos_t& pos, size_t by) { + if (!is_rev(pos)) { + pos -= 2*by; + } else { + pos += 2*by; + } +} + +pos_t rev_pos_t(const pos_t& pos) { + return make_pos_t(offset(pos), !is_rev(pos)); +} + +std::string pos_to_string(const pos_t& pos) { + return std::to_string(offset(pos)) + (is_rev(pos)?"-":"+"); +} + +} diff --git a/src/pos.hpp b/src/pos.hpp new file mode 100644 index 00000000..a890f99c --- /dev/null +++ b/src/pos.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include +#include + +namespace smoothxg { + +typedef uint64_t pos_t; +struct aln_pos_t { pos_t pos; uint64_t aln_length; }; +bool operator<(const aln_pos_t& a, const aln_pos_t& b); +bool operator==(const aln_pos_t& a, const aln_pos_t& b); +pos_t make_pos_t(uint64_t offset, bool is_rev); +uint64_t offset(const pos_t& pos); +bool is_rev(const pos_t& pos); +void incr_pos(pos_t& pos); +void incr_pos(pos_t& pos, size_t by); +void decr_pos(pos_t& pos); +void decr_pos(pos_t& pos, size_t by); +pos_t rev_pos_t(const pos_t& pos); +std::string pos_to_string(const pos_t& pos); + +} diff --git a/src/seqindex.cpp b/src/seqindex.cpp new file mode 100644 index 00000000..c7f7b176 --- /dev/null +++ b/src/seqindex.cpp @@ -0,0 +1,286 @@ +#include "seqindex.hpp" +#include "tempfile.hpp" +#include "odgi/odgi.hpp" +#include "blocks.hpp" +#include "xg.hpp" + + +namespace smoothxg { + +// load a FASTA or FASTQ file into a file with a name index mapping name -> offset and indexed with a CSA +// provide queries over this index that let us extract particular positions and subsequences +void seqindex_t::set_base_filename() { + seqfilename = temp_file::create("smoothxg-sqq"); + seqidxfile = temp_file::create("smoothxg-sqi"); + seqnamefile = temp_file::create("smoothxg-sqi.seqnames.tmp"); // used during construction +} + +void seqindex_t::build_index(const xg::XG &graph) { + set_base_filename(); + + std::ofstream seqnames(seqnamefile.c_str()); + std::ofstream seqout(seqfilename.c_str()); + std::vector seqname_offset; + std::vector seq_offset; + + size_t seq_bytes_written = 0; + size_t seq_names_bytes_written = 0; + + // read the path sequences + graph.for_each_path_handle([&](const path_handle_t &path) { + std::string seq; + graph.for_each_step_in_path(path,[&](const step_handle_t &step) { + seq.append(graph.get_sequence(graph.get_handle_of_step(step))); + }); + + seqname_offset.push_back(seq_names_bytes_written); + seq_offset.push_back(seq_bytes_written); + + std::string seq_name = ">" + graph.get_path_name(path); + seqnames << seq_name << " "; + seq_names_bytes_written += seq_name.size() + 1; + + // force the sequence to be upper-case + //std::transform(seq.begin(), seq.end(), seq.begin(), [](char c) { return std::toupper(c); }); + seqout << seq; + // record where the sequence starts + seq_bytes_written += seq.size(); + }); + + // add the last value so we can get sequence length for the last sequence and name + seq_offset.push_back(seq_bytes_written); + seqname_offset.push_back(seq_names_bytes_written); + seqnames.close(); + seqout.close(); + // save the count of sequences + seq_count = seqname_offset.size()-1; + // mark the seq name starts vector, adding a terminating mark + sdsl::bit_vector seq_name_starts(seqname_offset.back()+1); + for (size_t i = 0; i < seqname_offset.size(); ++i) { + seq_name_starts[seqname_offset[i]] = 1; + } + // build the name index + construct(seq_name_csa, seqnamefile, 1); + + // check if there are duplicated sequence names + std::ifstream seqnames_in(seqnamefile.c_str()); + bool duplicated_ids = false; + std::string line; + while (std::getline(seqnames_in, line, ' ')) { + //std::cout << line << " (" << locate(seq_name_csa, line).size() << ")" << std::endl; + std::string query = line + " "; + if(locate(seq_name_csa, query).size() > 1){ + duplicated_ids = true; + break; + } + } + seqnames_in.close(); + + // destroy the file + std::remove(seqnamefile.c_str()); + + if (duplicated_ids){ + std::cerr << "[smoothxg] ERROR: input sequences have duplicated IDs." << std::endl; + exit(1); + } + + // build the rest of the index + sdsl::util::assign(seq_name_cbv, sdsl::sd_vector<>(seq_name_starts)); + sdsl::util::assign(seq_name_cbv_rank, sdsl::sd_vector<>::rank_1_type(&seq_name_cbv)); + sdsl::util::assign(seq_name_cbv_select, sdsl::sd_vector<>::select_1_type(&seq_name_cbv)); + // mark the seq begin vector, adding a terminating mark + sdsl::bit_vector seq_begin_bv(seq_offset.back()+1); + for (size_t i = 0; i < seq_offset.size(); ++i) { + seq_begin_bv[seq_offset[i]] = 1; + } + sdsl::util::assign(seq_begin_cbv, sdsl::sd_vector<>(seq_begin_bv)); + sdsl::util::assign(seq_begin_cbv_rank, sdsl::sd_vector<>::rank_1_type(&seq_begin_cbv)); + sdsl::util::assign(seq_begin_cbv_select, sdsl::sd_vector<>::select_1_type(&seq_begin_cbv)); + //std::cerr << seq_offset_civ << std::endl; + // validate + // look up each sequence by name +} + +size_t seqindex_t::save(sdsl::structure_tree_node* s, const std::string& name) { + //assert(seq_name_csa.size() && seq_name_cbv.size() && seq_offset_civ.size()); + sdsl::structure_tree_node* child = sdsl::structure_tree::add_child(s, name, sdsl::util::class_name(*this)); + // open the sdsl index + std::ofstream out(seqidxfile.c_str()); + size_t written = 0; + out << "seqidx"; written += 9; + uint32_t version_buffer = OUTPUT_VERSION; + out.write((char*) &version_buffer, sizeof(version_buffer)); + written += sdsl::write_member(seq_count, out, child, "seq_count"); + written += seq_name_csa.serialize(out, child, "seq_name_csa"); + written += seq_name_cbv.serialize(out, child, "seq_name_cbv"); + written += seq_name_cbv_rank.serialize(out, child, "seq_name_cbv_rank"); + written += seq_name_cbv_select.serialize(out, child, "seq_name_cbv_select"); + written += seq_begin_cbv.serialize(out, child, "seq_begin_cbv"); + written += seq_begin_cbv_rank.serialize(out, child, "seq_begin_cbv_rank"); + written += seq_begin_cbv_select.serialize(out, child, "seq_begin_cbv_select"); + out.close(); + open_seq(seqfilename); + return written; +} + +void seqindex_t::remove_index_files() { + std::remove(seqfilename.c_str()); + std::remove(seqidxfile.c_str()); +} + +void seqindex_t::open_seq(const std::string& filename) { + if (seq_fd) return; //open + assert(!filename.empty()); + // open in binary mode as we are reading from this interface + seq_fd = open(filename.c_str(), O_RDWR); + if (seq_fd == -1) { + assert(false); + } + struct stat stats; + if (-1 == fstat(seq_fd, &stats)) { + assert(false); + } + seq_size = stats.st_size; + if (!(seq_buf = + (char*) mmap(NULL, + seq_size, + PROT_READ | PROT_WRITE, + MAP_SHARED, + seq_fd, + 0))) { + assert(false); + } + madvise((void*)seq_buf, seq_size, POSIX_MADV_WILLNEED | POSIX_MADV_SEQUENTIAL); +} + +void seqindex_t::close_seq(void) { + if (seq_buf) { + munmap(seq_buf, seq_size); + seq_buf = 0; + } + if (seq_fd) { + close(seq_fd); + seq_fd = 0; + } +} + +void seqindex_t::load(const std::string& filename) { + set_base_filename(); + std::ifstream in(seqidxfile.c_str()); + std::string magic; + in.read((char*)magic.c_str(), 6); + uint32_t version; + in.read((char*) &version, sizeof(version)); + assert(version == OUTPUT_VERSION); + sdsl::read_member(seq_count, in); + seq_name_csa.load(in); + seq_name_cbv.load(in); + seq_name_cbv_rank.load(in); + seq_name_cbv_select.load(in); + seq_begin_cbv.load(in); + seq_begin_cbv_rank.load(in); + seq_begin_cbv_select.load(in); + in.close(); // close the sdsl index input + open_seq(filename); +} + +void seqindex_t::to_fasta(std::ostream& out, size_t linewidth) const { + // extract the sequence names + for (size_t i = 1; i < seq_count+1; ++i) { + auto name = nth_name(i); + out << ">" << name << std::endl; + // pad sequence + size_t seq_length = nth_seq_length(i); + // for chunk of 80 up to sequence length + //out << subseq(name, 0, linewidth) << std::endl; + for (size_t j = 0; j < seq_length; j += linewidth) { + out << subseq(name, j, std::min(linewidth, seq_length - j)) << std::endl; + } + } + // iterate through the sequence names and extract the sequences +} + +std::string seqindex_t::nth_name(size_t n) const { + // get the extents from our seq name dictionary + size_t begin = seq_name_cbv_select(n)+1; // step past '>' delimiter + size_t end = seq_name_cbv_select(n+1)-2; // step back past added ' ' + std::string name = sdsl::extract(seq_name_csa, begin, end); + return name; +} + +size_t seqindex_t::rank_of_seq_named(const std::string& name) const { + std::string query = ">" + name + " "; + //std::cerr << query << std::endl; + auto occs = locate(seq_name_csa, query); + //std::cerr << "occurs " << occs << std::endl; + assert(occs.size() == 1); + return seq_name_cbv_rank(occs[0])+1; +} + +size_t seqindex_t::nth_seq_length(size_t n) const { + //std::cerr << "trying for " << n << std::endl; + return seq_begin_cbv_select(n+1)-seq_begin_cbv_select(n); +} + +size_t seqindex_t::nth_seq_offset(size_t n) const { + return seq_begin_cbv_select(n); +} + +std::string seqindex_t::seq(const std::string& name) const { + return subseq(name, 0, nth_seq_length(rank_of_seq_named(name))); +} + +std::string seqindex_t::subseq(const std::string& name, size_t pos, size_t count) const { + size_t n = rank_of_seq_named(name); + return subseq(n, pos, count); +} + +std::string seqindex_t::subseq(size_t n, size_t pos, size_t count) const { + return subseq(nth_seq_offset(n)+pos, count); +} + +std::string seqindex_t::subseq(size_t pos, size_t count) const { + std::string s; s.resize(count); + memcpy((void*)s.c_str(), &seq_buf[pos], count); + return s; +} + +size_t seqindex_t::pos_in_all_seqs(const std::string& name, size_t pos, bool is_rev) const { + return pos_in_all_seqs(rank_of_seq_named(name), pos, is_rev); +} + +size_t seqindex_t::pos_in_all_seqs(size_t n, size_t pos, bool is_rev) const { + //std::cerr << "nth seq length " << nth_seq_length(n) << " offset " << nth_seq_offset(n) << std::endl; + return nth_seq_offset(n) + (is_rev ? nth_seq_length(n)-1-pos : pos); +} + +size_t seqindex_t::seq_length(void) const { + return seq_begin_cbv.size()-1; +} + +char seqindex_t::at(size_t pos) const { + return seq_buf[pos]; +} + +char seqindex_t::at_pos(pos_t pos) const { + // assumes 0-based pos + char c = at(offset(pos)); + if (is_rev(pos)) { + c = dna_reverse_complement(c); + } + return c; +} + +size_t seqindex_t::n_seqs(void) const { + return seq_count; +} + +size_t seqindex_t::seq_id_at(size_t pos) const { + return seq_begin_cbv_rank(pos+1); +} + +bool seqindex_t::seq_start(size_t pos) const { + return seq_begin_cbv[pos] == 1; +} + +} diff --git a/src/seqindex.hpp b/src/seqindex.hpp new file mode 100644 index 00000000..5cc6c42d --- /dev/null +++ b/src/seqindex.hpp @@ -0,0 +1,81 @@ +#ifndef SEQINDEX_HPP_INCLUDED +#define SEQINDEX_HPP_INCLUDED + +#include +#include +#include +#include +#include +#include +#include "sdsl/bit_vectors.hpp" +#include "sdsl/csa_wt.hpp" +#include "sdsl/suffix_arrays.hpp" +#include "sdsl/dac_vector.hpp" +#include "gzstream.h" +#include "pos.hpp" +#include "dna.hpp" +#include "xg.hpp" + +namespace smoothxg { + +class seqindex_t { + +private: + + std::string basefilename; + std::string seqfilename; + std::string seqnamefile; + std::string seqidxfile; + size_t seq_count = 0; + // a file containing the concatenated sequences + //std::vector seqfiles; + char* seq_buf; + int seq_fd = 0; + size_t seq_size = 0; + void open_seq(const std::string& name); + void close_seq(void); + //std::ifstream& get_seqfile(void); + // sequence offsets (for offset and length) + sdsl::sd_vector<> seq_begin_cbv; + sdsl::sd_vector<>::rank_1_type seq_begin_cbv_rank; + sdsl::sd_vector<>::select_1_type seq_begin_cbv_select; + // seq name compressed suffix array + sdsl::csa_wt<> seq_name_csa; + // seq name index + sdsl::sd_vector<> seq_name_cbv; + sdsl::sd_vector<>::rank_1_type seq_name_cbv_rank; + sdsl::sd_vector<>::select_1_type seq_name_cbv_select; + uint32_t OUTPUT_VERSION = 1; // update as we change our format + +public: + + seqindex_t(void) { } + ~seqindex_t(void) { close_seq(); } + void set_base_filename(); + void build_index(const xg::XG &graph); + size_t save(sdsl::structure_tree_node* s = NULL, const std::string& name = ""); + void load(const std::string& filename); + void remove_index_files(void); + void to_fasta(std::ostream& out, size_t linewidth = 60) const; + std::string nth_name(size_t n) const; + size_t rank_of_seq_named(const std::string& name) const; + size_t nth_seq_length(size_t n) const; + size_t nth_seq_offset(size_t n) const; + std::string seq(const std::string& name) const; + std::string subseq(const std::string& name, size_t pos, size_t count) const; + std::string subseq(size_t n, size_t pos, size_t count) const; + std::string subseq(size_t pos, size_t count) const; + size_t pos_in_all_seqs(const std::string& name, size_t pos, bool is_rev) const; + size_t pos_in_all_seqs(size_t n, size_t pos, bool is_rev) const; + size_t seq_length(void) const; + char at(size_t pos) const; + char at_pos(pos_t pos) const; + size_t n_seqs(void) const; + size_t seq_id_at(size_t pos) const; + bool seq_start(size_t pos) const; + +}; + +} + +#endif diff --git a/src/smooth.cpp b/src/smooth.cpp index c1b02ec8..e5268ef5 100644 --- a/src/smooth.cpp +++ b/src/smooth.cpp @@ -1527,8 +1527,15 @@ void _write_merged_maf_blocks( */ } -odgi::graph_t* smooth_and_lace(const xg::XG &graph, +void smooth_and_lace(const xg::XG &graph, blockset_t*& blockset, + mmmulti::set &path_mapping, + std::vector &block_graphs, + std::vector &consensus_mapping, + std::vector> &merged_block_id_intervals_tree_vector, + std::vector &block_id_ranges_vector, + ska::flat_hash_set &inverted_merged_block_id_intervals_ranks, + std::vector &is_block_in_a_merged_group, int poa_m, int poa_n, int poa_g, int poa_e, int poa_q, int poa_c, @@ -1555,54 +1562,11 @@ odgi::graph_t* smooth_and_lace(const xg::XG &graph, // // record the start and end points of all the path ranges and the consensus // - uint64_t block_count = blockset->size(); - auto _block_graphs = std::make_unique>(block_count, nullptr); - auto& block_graphs = *_block_graphs; // get a ref - - auto get_block_graph = - [&](const uint64_t& block_id) { - std::string data; - zstdutil::DecompressString(*block_graphs[block_id], data); - stringstream ss; - ss << data; - ss.seekg(0,std::ios_base::beg); - auto block_graph = std::make_unique(); - block_graph->deserialize_members(ss); - return block_graph; - }; - - auto save_block_graph = - [&](const uint64_t& block_id, - const odgi::graph_t* block_graph) { - std::stringstream ss; - block_graph->serialize_members(ss); - std::string*& s = block_graphs[block_id]; - if (s == nullptr) { - s = new std::string; - } else { - s->clear(); - } - zstdutil::CompressString(ss.str(), *s); - }; - - // mapping from path fragments to block graphs - auto _path_mapping_tmp = temp_file::create(); - auto path_mapping_ptr = std::make_unique>(_path_mapping_tmp); - auto& path_mapping = *path_mapping_ptr; - path_mapping.open_writer(); - - // mapping from block to consensus ids - std::vector consensus_mapping(add_consensus ? blockset->size() : 0); - - std::vector> merged_block_id_intervals_tree_vector; - std::vector block_id_ranges_vector; - ska::flat_hash_set inverted_merged_block_id_intervals_ranks; // IITree can't store inverted intervals + std::atomic num_flipped_graphs(0); atomicbitvector::atomic_bv_t blok_to_flip(blockset->size()); - std::vector is_block_in_a_merged_group((add_consensus && merge_blocks) ? blockset->size() : 0); - #ifdef POA_DEBUG std::vector> block2stats(blockset->size()); #endif @@ -1836,7 +1800,7 @@ odgi::graph_t* smooth_and_lace(const xg::XG &graph, // quietly groom by flipping the block to prefer the forward orientation of the lowest-ranked path // - auto block_graph = get_block_graph(block_id); + auto block_graph = get_block_graph(block_graphs, block_id); uint64_t first_id = std::numeric_limits::max(); path_handle_t groom_target_path; block_graph->for_each_path_handle( @@ -2311,7 +2275,7 @@ odgi::graph_t* smooth_and_lace(const xg::XG &graph, consensus_mapping[block_id] = block_graph->get_path_handle(consensus_name); } } - save_block_graph(block_id, block_graph); + save_block_graph(block_graphs, block_id, block_graph); delete block_graph; poa_progress.increment(1); if (produce_maf || (add_consensus && merge_blocks)){ @@ -2359,6 +2323,7 @@ odgi::graph_t* smooth_and_lace(const xg::XG &graph, // Flip graphs if (num_flipped_graphs > 0){ + const uint64_t block_count = blockset->size(); std::stringstream flip_graphs_banner; flip_graphs_banner << smoothxg_iter << "::smooth_and_lace] flipping " << num_flipped_graphs << " block graphs:"; progress_meter::ProgressMeter flip_graphs_progress(block_count, flip_graphs_banner.str()); @@ -2370,7 +2335,7 @@ odgi::graph_t* smooth_and_lace(const xg::XG &graph, ska::flat_hash_map forward_translation; - auto block_graph = get_block_graph(block_id); + auto block_graph = get_block_graph(block_graphs, block_id); // make the flipped nodes block_graph->for_each_handle( @@ -2434,7 +2399,7 @@ odgi::graph_t* smooth_and_lace(const xg::XG &graph, }); - save_block_graph(block_id, flipped_graph); + save_block_graph(block_graphs, block_id, flipped_graph); delete flipped_graph; flip_graphs_progress.increment(1); @@ -2444,389 +2409,6 @@ odgi::graph_t* smooth_and_lace(const xg::XG &graph, } else { std::cerr << smoothxg_iter << "::smooth_and_lace] flipping 0 block graphs" << std::endl; } - - std::cerr << smoothxg_iter << "::smooth_and_lace] sorting path fragments" << std::endl; - // sort the path range mappings by path handle id, then start position - // this will allow us to walk through them in order - /* - ips4o::parallel::sort( - path_mapping.begin(), path_mapping.end(), - [](const path_position_range_t &a, const path_position_range_t &b) { - auto &a_id = as_integer(get_base_path(a)); - auto &b_id = as_integer(get_base_path(b)); - return (a_id < b_id || a_id == b_id && get_start_pos(a) < get_start_pos(b)); - }); - */ - path_mapping.index(n_threads); - std::cerr << smoothxg_iter << "::smooth_and_lace] sorted " << path_mapping.size() << " path fragments" << std::endl; - - // build the sequence and edges into the output graph - auto* smoothed = new odgi::graph_t(); - std::vector paths; // for parallel iteration - - // add the nodes and edges to the graph - { - std::vector id_mapping; - - std::stringstream load_graphs_banner; - load_graphs_banner << smoothxg_iter << "::smooth_and_lace] loading " << block_count << " graph blocks:"; - progress_meter::ProgressMeter load_graphs_progress(block_count, load_graphs_banner.str()); - std::vector> graphs(block_count); -#pragma omp parallel for schedule(dynamic,1) - for (uint64_t idx = 0; idx < block_count; ++idx) { - std::string data; - zstdutil::DecompressString(*block_graphs[idx], data); - stringstream ss; - ss << data; - ss.seekg(0,std::ios_base::beg); - graphs[idx] = std::make_unique(); - graphs[idx]->deserialize_members(ss); - delete block_graphs[idx]; - load_graphs_progress.increment(1); - } - load_graphs_progress.finish(); - _block_graphs.reset(nullptr); // we've decompressed these, now clear our block graphs - - std::stringstream add_graph_banner; - add_graph_banner << smoothxg_iter << "::smooth_and_lace] adding nodes from " << block_count << " graphs:"; - progress_meter::ProgressMeter add_graph_progress(block_count, add_graph_banner.str()); - - for (uint64_t idx = 0; idx < block_count; ++idx) { - uint64_t id_trans = smoothed->get_node_count(); - // record the id translation - auto& block = graphs[idx]; - id_mapping.push_back(id_trans); - if (block->get_node_count() == 0) { - continue; - } - block->for_each_handle([&](const handle_t &h) { - smoothed->create_handle(block->get_sequence(h)); - }); - add_graph_progress.increment(1); - } - add_graph_progress.finish(); - - std::stringstream add_edges_banner; - add_edges_banner << smoothxg_iter << "::smooth_and_lace] adding edges from " << block_count << " graphs:"; - progress_meter::ProgressMeter add_edges_progress(block_count, add_edges_banner.str()); - for (uint64_t idx = 0; idx < block_count; ++idx) { - auto& id_trans = id_mapping[idx]; - auto& block = graphs[idx]; - block->for_each_edge([&](const edge_t &e) { - smoothed->create_edge( - smoothed->get_handle(id_trans + block->get_id(e.first)), - smoothed->get_handle(id_trans + block->get_id(e.second))); - }); - add_edges_progress.increment(1); - } - add_edges_progress.finish(); - - // then for each path, ensure that it's embedded in the graph by walking through - // its block segments in order and linking them up in the output graph - std::stringstream lace_banner; - lace_banner << smoothxg_iter << "::smooth_and_lace] embedding " << path_mapping.size() << " path fragments:"; - progress_meter::ProgressMeter lace_progress(path_mapping.size(), lace_banner.str()); - for (uint64_t i = 0; i < path_mapping.size(); ++i) { - path_position_range_t pos_range = path_mapping.read_value(i); - step_handle_t last_step = {0, 0}; - bool first = true; - uint64_t last_end_pos = 0; - // add the path to the graph - - path_handle_t smoothed_path = smoothed->create_path_handle( - graph.get_path_name(get_base_path(pos_range))); - // walk the path from start to end - while (true) { - // if we find a segment that's not included in any block, we'll add - // it to the final graph and link it in to do so, we detect a gap in - // length, collect the sequence in the gap and add it to the graph - // as a node then add it as a traversal to the path - if (get_start_pos(pos_range) - last_end_pos > 0) { - assert(false); // assert that we've included all sequence in blocks - } - // write the path steps into the graph using the id translation - auto block_id = get_block_id(pos_range); - auto& block = graphs[block_id]; - auto id_trans = id_mapping.at(block_id); - block->for_each_step_in_path( - get_target_path(pos_range), [&](const step_handle_t &step) { - handle_t h = block->get_handle_of_step(step); - handle_t t = smoothed->get_handle(block->get_id(h) + id_trans, - block->get_is_reverse(h)); - smoothed->append_step(smoothed_path, t); - if (first) { - first = false; - // create edge between last and curr - if (as_integers(last_step)[0] != 0) { - smoothed->create_edge( - smoothed->get_handle_of_step(last_step), t); - } - } - }); - last_step = smoothed->path_back(smoothed_path); - last_end_pos = get_end_pos(pos_range); - if (i + 1 == path_mapping.size() || - get_base_path(path_mapping.read_value(i + 1)) != get_base_path(pos_range)) { - break; - } else { - ++i; - pos_range = path_mapping.read_value(i); - } - lace_progress.increment(1); - } - // now add in any final sequence in the path - // and add it to the path, add the edge - if (graph.get_path_length(get_base_path(pos_range)) > last_end_pos) { - assert(false); // assert that we've included all sequence in the blocks - } - } - lace_progress.finish(); - - path_mapping.close_reader(); - std::remove(_path_mapping_tmp.c_str()); - path_mapping_ptr.reset(nullptr); - - // now verify that smoothed has paths that are equal to the base graph - // and that all the paths are fully embedded in the graph - smoothed->for_each_path_handle( - [&](const path_handle_t &path) { - paths.push_back(path); - }); - - { - std::stringstream validate_banner; - validate_banner << smoothxg_iter << "::smooth_and_lace] validating " << paths.size() << " path sequences:"; - progress_meter::ProgressMeter validate_progress(paths.size(), validate_banner.str()); - -#pragma omp parallel for schedule(dynamic,1) - for (uint64_t i = 0; i < paths.size(); ++i) { - auto path = paths[i]; - - std::string orig_seq, smoothed_seq; - graph.for_each_step_in_path( - graph.get_path_handle(smoothed->get_path_name(path)), - [&](const step_handle_t &step) { - orig_seq.append(graph.get_sequence(graph.get_handle_of_step(step))); - }); - smoothed->for_each_step_in_path( - path, - [&](const step_handle_t &step) { - smoothed_seq.append(smoothed->get_sequence(smoothed->get_handle_of_step(step))); - }); - if (orig_seq != smoothed_seq) { - std::cerr << smoothxg_iter << "] error! path " - << smoothed->get_path_name(path) - << " was corrupted in the smoothed graph" << std::endl - << "original\t" << orig_seq << std::endl - << "smoothed\t" << smoothed_seq << std::endl; - exit(1); - } - - validate_progress.increment(1); - } - validate_progress.finish(); - } - - if (!consensus_mapping.empty()) { - std::cerr << smoothxg_iter << "::smooth_and_lace] sorting consensus" << std::endl; - - // consensus path and connections - - // by definition, the consensus paths are embedded in our blocks, which simplifies - // things we'll still need to add a new path for each consensus path - - // flag the blocks that we should include unmerged - atomicbitvector::atomic_bv_t exclude_unmerged_consensus(block_count); - - // Is there something merged? - if (!merged_block_id_intervals_tree_vector.empty()) { -#pragma omp parallel for schedule(dynamic,1) - for (auto& merged_block_id_intervals_tree : merged_block_id_intervals_tree_vector) { - merged_block_id_intervals_tree.index(); - } - - if (!preserve_unmerged_consensus) { - std::cerr << smoothxg_iter << "::smooth_and_lace] embedding consensus: removing redundant single consensus" << std::endl; - -#pragma omp parallel for schedule(dynamic,1) - for (uint64_t id = 0; id < consensus_mapping.size(); ++id) { - if (is_block_in_a_merged_group[id]) { - exclude_unmerged_consensus.set(id); - } - } - } - } - - // all raw consensus paths - std::vector consensus_paths(block_count); - - // Unmerged consensus sequences - // First, create the path handles - std::cerr << smoothxg_iter << "::smooth_and_lace] embedding consensus: creating path handles" << std::endl; - for (uint64_t id = 0; id < consensus_mapping.size(); ++id) { - //for (auto &pos_range : consensus_mapping) { - if (!exclude_unmerged_consensus.test(id)) { - auto& block = graphs[id]; - consensus_paths[id] = smoothed->create_path_handle( - block->get_path_name(consensus_mapping[id])); - } // else skip the embedding of the single consensus sequences - } - - // Next, add the steps - std::cerr << smoothxg_iter << "::smooth_and_lace] embedding consensus: creating step handles" << std::endl; -#pragma omp parallel for schedule(dynamic,1) - for (uint64_t id = 0; id < consensus_mapping.size(); ++id) { - //for(auto& pos_range : consensus_mapping){ - if (exclude_unmerged_consensus.test(id)) { - continue; // skip the embedding for the single consensus sequence - } - auto& block = graphs[id]; - path_handle_t smoothed_path = consensus_paths[id]; - auto &id_trans = id_mapping[id]; - block->for_each_step_in_path(consensus_mapping[id], [&](const step_handle_t &step) { - handle_t h = block->get_handle_of_step(step); - handle_t t = smoothed->get_handle(block->get_id(h) + id_trans, block->get_is_reverse(h)); - smoothed->append_step(smoothed_path, t); - // nb: by definition of our construction of smoothed - // the consensus paths should have all their edges embedded - }); - } - - // Merged consensus sequences - if (!merged_block_id_intervals_tree_vector.empty()) { - // First, create the path handles - std::cerr << smoothxg_iter << "::smooth_and_lace] embedding merged consensus: creating path handles" << std::endl; - std::vector merged_consensus_paths; - - for (auto &block_id_ranges : block_id_ranges_vector) { - assert(!smoothed->has_path(consensus_base_name + block_id_ranges)); - merged_consensus_paths.push_back( - smoothed->create_path_handle(consensus_base_name + block_id_ranges) - ); - } - - // Next, add the steps - std::cerr << smoothxg_iter << "::smooth_and_lace] embedding merged consensus: creating step handles" << std::endl; - std::mutex consensus_path_is_merged_mutex; - ska::flat_hash_set consensus_path_is_merged; - assert(merged_block_id_intervals_tree_vector.size() == block_id_ranges_vector.size()); - -#pragma omp parallel for schedule(dynamic,1) - for (uint64_t i = 0; i < merged_block_id_intervals_tree_vector.size(); ++i) { - auto &merged_block_id_intervals_tree = merged_block_id_intervals_tree_vector[i]; - - bool inverted_intervals = inverted_merged_block_id_intervals_ranks.count(i) != 0; - path_handle_t consensus_path = merged_consensus_paths[i]; - - std::vector merged_block_id_intervals; - merged_block_id_intervals_tree.overlap(0, block_count, merged_block_id_intervals); - - uint64_t start_interval = 0; - uint64_t end_interval = merged_block_id_intervals.size() - 1; - int8_t step_interval = 1; - int8_t step = 1; - if (inverted_intervals) { - start_interval = merged_block_id_intervals.size() - 1; - end_interval = 0; - step_interval = -1; - step = -1; - } - - for (uint64_t j = start_interval; j != (end_interval + step_interval); j += step_interval) { - auto &merged_block_id_interval_idx = merged_block_id_intervals[j]; - - uint64_t start = merged_block_id_intervals_tree.start(merged_block_id_interval_idx); - uint64_t end = merged_block_id_intervals_tree.end(merged_block_id_interval_idx) - 1; - if (inverted_intervals){ - uint64_t tmp = start; - start = end; - end = tmp; - - /*{ - std::lock_guard guard(consensus_path_is_merged_mutex); - - std::cerr << i << ": start-end " << start << "-" << end < guard(consensus_path_is_merged_mutex); - - consensus_path_is_merged.insert(as_integer(consensus_paths[block_id])); - } - - auto& block = graphs[block_id]; - auto& id_trans = id_mapping[block_id]; - block->for_each_step_in_path( - consensus_mapping[block_id], - [&](const step_handle_t &step) { - handle_t h = block->get_handle_of_step(step); - handle_t t = smoothed->get_handle(block->get_id(h) + id_trans, block->get_is_reverse(h)); - smoothed->append_step(consensus_path, t); - }); - } - } - - clear_string(block_id_ranges_vector[i]); - } - - // now for each consensus path that's not been merged, and for each merged consensus path... - // record our path handles for later use in consensus graph generation - - consensus_paths.erase( - std::remove_if( - consensus_paths.begin(), consensus_paths.end(), - [&consensus_path_is_merged](const path_handle_t& path) { - return consensus_path_is_merged.count(as_integer(path)) > 0; - }), - consensus_paths.end()); - - consensus_paths.reserve( - consensus_paths.size() - + std::distance(merged_consensus_paths.begin(), - merged_consensus_paths.end())); - consensus_paths.insert( - consensus_paths.end(), - merged_consensus_paths.begin(), - merged_consensus_paths.end()); - - } - - // todo: validate the consensus paths as well - - consensus_path_names.reserve(consensus_paths.size()); - for (auto &path : consensus_paths) { - consensus_path_names.push_back(smoothed->get_path_name(path)); - } - } - } - - { - std::stringstream embed_banner; - embed_banner << smoothxg_iter << "::smooth_and_lace] walking edges in " - << paths.size() << " paths:"; - progress_meter::ProgressMeter embed_progress(paths.size(), embed_banner.str()); - // embed all paths in the graph to ensure validity - smoothed->for_each_path_handle( - [&](const path_handle_t& path) { - handle_t last; - step_handle_t begin_step = smoothed->path_begin(path); - smoothed->for_each_step_in_path( - path, - [&](const step_handle_t &step) { - handle_t h = smoothed->get_handle_of_step(step); - if (step != begin_step) { - smoothed->create_edge(last, h); - } - last = h; - }); - embed_progress.increment(1); - }); - embed_progress.finish(); - } - - return smoothed; } void build_odgi_abPOA(abpoa_t *ab, abpoa_para_t *abpt, odgi::graph_t* output, diff --git a/src/smooth.hpp b/src/smooth.hpp index fdcebdbd..8b856f9b 100644 --- a/src/smooth.hpp +++ b/src/smooth.hpp @@ -13,7 +13,6 @@ #include "spoa/spoa.hpp" #include "xg.hpp" #include "utils.hpp" -#include "zstdutil.hpp" //#include "patchmap.hpp" #include "flat_hash_map.hpp" #include @@ -98,8 +97,15 @@ odgi::graph_t* smooth_spoa(const xg::XG &graph, const block_t &block, uint64_t b #endif const std::string &consensus_name = ""); -odgi::graph_t* smooth_and_lace(const xg::XG &graph, +void smooth_and_lace(const xg::XG &graph, blockset_t*& blockset, + mmmulti::set &path_mapping, + std::vector &block_graphs, + std::vector &consensus_mapping, + std::vector> &merged_block_id_intervals_tree_vector, + std::vector &block_id_ranges_vector, + ska::flat_hash_set &inverted_merged_block_id_intervals_ranks, + std::vector &is_block_in_a_merged_group, int poa_m, int poa_n, int poa_g, int poa_e, int poa_q, int poa_c, diff --git a/src/utils.cpp b/src/utils.cpp index a6fc4028..462fb9d7 100644 --- a/src/utils.cpp +++ b/src/utils.cpp @@ -75,4 +75,31 @@ double handy_parameter(const std::string& value, const double default_value) { return is_a_number(tmp) ? (stod(tmp) * pow(10, exp)) : default_value; } +std::unique_ptr get_block_graph(std::vector &block_graphs, const uint64_t &block_id) { + std::string data; + zstdutil::DecompressString(*block_graphs[block_id], data); + stringstream ss; + ss << data; + ss.seekg(0,std::ios_base::beg); + auto block_graph = std::make_unique(); + block_graph->deserialize_members(ss); + return block_graph; +} + +void save_block_graph(std::vector &block_graphs, const uint64_t &block_id, const odgi::graph_t *block_graph){ + std::stringstream ss; + block_graph->serialize_members(ss); + std::string*& s = block_graphs[block_id]; + if (s == nullptr) { + s = new std::string; + } else { + s->clear(); + } + zstdutil::CompressString(ss.str(), *s); +} + +uint64_t modulo(const uint64_t n, const uint64_t d) { + return (n & (d - 1)); +} + } diff --git a/src/utils.hpp b/src/utils.hpp index 4771bbe3..6cefe673 100644 --- a/src/utils.hpp +++ b/src/utils.hpp @@ -5,6 +5,7 @@ #include #include #include +#include "zstdutil.hpp" namespace smoothxg { @@ -26,4 +27,10 @@ void graph_deep_copy(odgi::graph_t* source, double handy_parameter(const std::string& value, const double default_value); +std::unique_ptr get_block_graph(std::vector &block_graphs, const uint64_t &block_id); + +void save_block_graph(std::vector &block_graphs, const uint64_t &block_id, const odgi::graph_t *block_graph); + +uint64_t modulo(const uint64_t n, const uint64_t d); + }