From 7b1e135d7f38e05a3bb5c40682b5c80ef047ce16 Mon Sep 17 00:00:00 2001 From: Yenaled Date: Mon, 16 Sep 2024 22:23:45 -0700 Subject: [PATCH] union, em priors (fix), shade, and no-jump features --- src/KmerIndex.cpp | 114 ++++++++++++++++++++++++++++++++++++++++++- src/KmerIndex.h | 14 ++++++ src/MinCollector.cpp | 90 ++++++++++++++++++++++++++++++++-- src/MinCollector.h | 1 + src/ProcessReads.cpp | 37 ++++++++++++-- src/common.h | 6 ++- src/main.cpp | 47 ++++++++++++++++-- 7 files changed, 297 insertions(+), 12 deletions(-) diff --git a/src/KmerIndex.cpp b/src/KmerIndex.cpp index d6de5b9c..666cfd50 100755 --- a/src/KmerIndex.cpp +++ b/src/KmerIndex.cpp @@ -10,6 +10,7 @@ #include #include #include +#include #include "ColoredCDBG.hpp" // --aa option helper functions @@ -231,6 +232,18 @@ std::pair KmerIndex::getECInfo() const { return std::make_pair(max_ec_len, cardinality_zero_encounters); } +// Begin Shading +std::pair shadedTargetName(std::string& name) { + if (name.find("_shade_") != std::string::npos) { + std::string name_header = "_shade_"; + std::string tname = name.substr(0, name.find(name_header)); + std::string variant = name.substr(name.find(name_header)+name_header.length(), name.size()); + return std::make_pair(tname,variant); // Return the target name and its associated shade + } + return std::make_pair("",""); // Not a shade +} +// End Shading + void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out) { // read input u_set_ unique_names; @@ -354,6 +367,18 @@ void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out) } unique_names.insert(name); target_names_.push_back(name); + + // Begin Shading + auto shade_info = shadedTargetName(name); + if (shade_info.first != "") { + std::string tname = shade_info.first; + std::string variant = shade_info.second; + auto it = std::find(target_names_.begin(), target_names_.end(), tname); + if (it != target_names_.end()) { + 
shadeToColorTranscriptMap[target_names_.size()-1] = std::distance(target_names_.begin(), it); + } + } + // End Shading } } @@ -399,6 +424,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea size_t num_seqs = 0; int max_color = 0; u_set_ external_input_names; + // Begin Shading + std::set variants_set; // Ordered set to keep track of variants (i.e. colors with shades) + // End Shading for (int i = 0; i < opt.transfasta.size(); i++) { // Currently, this should only be one file auto fasta = opt.transfasta[i]; fp = opt.transfasta.size() == 1 && opt.transfasta[0] == "-" ? gzdopen(fileno(stdin), "r") : gzopen(fasta.c_str(), "r"); @@ -414,9 +442,23 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea continue; } int color = std::atoi(strname.c_str()); + // Begin Shading + std::string variant; + auto shade_info = shadedTargetName(strname); + if (shade_info.first != "") { + std::string tname = shade_info.first; + variant = shade_info.second; + color = std::atoi(tname.c_str()); + variants_set.insert(std::to_string(color) + "_shade_" + variant); + } + // End Shading external_input_names.insert(color); if (color > max_color) max_color = color; - of << ">" << std::to_string(color) << "\n" << str << std::endl; + of << ">" << std::to_string(color); + // Begin Shading + if (!variant.empty()) of << "_shade_" << variant; + // End Shading + of << "\n" << str << std::endl; num_seqs++; } gzclose(fp); @@ -437,6 +479,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea target_names_.push_back(std::to_string(i)); target_lens_.push_back(k); // dummy length (k-mer size) } + // Begin Shading + for (const auto& v : variants_set) { + num_trans++; // Each color-shade duo counts as an additional target + target_names_.push_back(v); + target_lens_.push_back(k); // dummy length (k-mer size) + } + if (num_trans != ncolors) { + std::cerr << "[build] Detected " << std::to_string(num_trans-ncolors) << " 
shades" << std::endl; + } + // End Shading std::cerr << "[build] Building graph from k-mers" << std::endl; BuildDeBruijnGraph(opt, tmp_file2, out); @@ -449,6 +501,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea std::vector > trinfos(dbg.size()); std::ifstream infile_a(tmp_file2); int current_color = 0; + // Begin Shading + std::string current_variant; + // End Shading std::string line; while (std::getline(infile_a, line)) { if (line.length() == 0) { @@ -458,6 +513,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea current_color = onlist_sequences.cardinality(); } else { current_color = std::atoi(line.c_str()+1); + // Begin Shading + current_variant = ""; + std::string name = line.substr(1); + auto shade_info = shadedTargetName(name); + if (shade_info.first != "") { + std::string tname = shade_info.first; + current_variant = shade_info.second; + current_color = std::atoi(tname.c_str()); + } + // End Shading } continue; } @@ -481,8 +546,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea tr.pos = (proc-um.len) | (!um.strand ? 
sense : missense); tr.start = um.dist; tr.stop = um.dist + um.len; - trinfos[n->id].push_back(tr); + + // Begin Shading + if (!current_variant.empty()) { + auto it = variants_set.find(std::to_string(current_color) + "_shade_" + current_variant); + assert(it != variants_set.end()); + tr.trid = ncolors + std::distance(variants_set.begin(), it); + trinfos[n->id].push_back(tr); + } + // End Shading } } infile_a.close(); @@ -995,6 +1068,13 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st tr.stop = um.dist + um.len; trinfos[n->id].push_back(tr); + // Begin Shading + auto it = shadeToColorTranscriptMap.find(tr.trid); + if (it != shadeToColorTranscriptMap.end()) { + tr.trid = shadeToColorTranscriptMap[tr.trid]; + trinfos[n->id].push_back(tr); // Add the color of the original transcript as well + } + // End Shading } j++; } @@ -1020,6 +1100,11 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st std::cerr << "[build] target de Bruijn graph has k-mer length " << dbg.getK() << " and minimizer length " << dbg.getG() << std::endl; std::cerr << "[build] target de Bruijn graph has " << dbg.size() << " contigs and contains " << dbg.nbKmers() << " k-mers " << std::endl; + // Begin Shading + if (shadeToColorTranscriptMap.size() != 0) { + std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl; + } + // End Shading } void KmerIndex::PopulateMosaicECs(std::vector >& trinfos) { @@ -1418,6 +1503,19 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) { in.read(buffer, tmp_size); target_names_.push_back(std::string(buffer)); + // Begin Shading + auto shade_info = shadedTargetName(target_names_[target_names_.size()-1]); + if (shade_info.first != "") { + std::string tname = shade_info.first; + std::string variant = shade_info.second; + auto it = std::find(target_names_.begin(), target_names_.end(), tname); + if (it != target_names_.end()) { + 
shadeToColorTranscriptMap[i] = std::distance(target_names_.begin(), it); + } + use_shade = true; + shade_sequences.add(i); + } + // End Shading } delete[] buffer; @@ -1438,6 +1536,11 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) { if (num_trans-onlist_sequences.cardinality() > 0) { std::cerr << "[index] number of D-list k-mers: " << pretty_num(static_cast(num_trans-onlist_sequences.cardinality())) << std::endl; } + // Begin Shading + if (shadeToColorTranscriptMap.size() != 0) { + std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl; + } + // End Shading in.close(); @@ -1594,6 +1697,11 @@ int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2) const { // post: v contains all equiv classes for the k-mers in s void KmerIndex::match(const char *s, int l, std::vector, int>>& v, bool partial, bool cfc) const{ const Node* n; + + // Begin Shading + if (use_shade) partial = false; + // End Shading + if (do_union) partial = false; // TODO: // Rework KmerIndex::match() such that it uses the following type of logic @@ -1664,6 +1772,8 @@ void KmerIndex::match(const char *s, int l, std::vectorsecond}); + + if (no_jump) continue; // Find start and end of O.G. kallisto contig w.r.t. 
the bifrost-kallisto // unitig diff --git a/src/KmerIndex.h b/src/KmerIndex.h index 142e06ae..4e0fe32a 100755 --- a/src/KmerIndex.h +++ b/src/KmerIndex.h @@ -77,6 +77,11 @@ struct KmerIndex { //LoadTranscripts(opt.transfasta); load_positional_info = opt.bias || opt.pseudobam || opt.genomebam || !opt.single_overhang; dfk_onlist = opt.dfk_onlist; + do_union = opt.do_union; + no_jump = opt.no_jump; + // Begin Shading + use_shade = false; + // End Shading } ~KmerIndex() {} @@ -131,6 +136,8 @@ struct KmerIndex { std::vector target_names_; std::vector target_seqs_; // populated on demand bool dfk_onlist; // If we want to not use D-list in intersecting ECs + bool do_union; // If we want to do "pseudoalignment" via a "union" rather than an "intersection" + bool no_jump; // If we want to skip the jumping logic during pseudoalignment bool target_seqs_loaded; bool load_positional_info; // when should we load positional info in addition to strandedness @@ -141,6 +148,13 @@ struct KmerIndex { u_set_ d_list; Kmer dummy_dfk; const_UnitigMap um_dummy; + + // Begin Shading + // Here, we use the concept of "shades" as proposed in Ornaments (Adduri & Kim, 2024) for bias-corrected allele-specific expression estimation + std::unordered_map shadeToColorTranscriptMap; + Roaring shade_sequences; + bool use_shade; + // End Shading }; #endif // KALLISTO_KMERINDEX_H diff --git a/src/MinCollector.cpp b/src/MinCollector.cpp index 114bfe38..82a326b4 100644 --- a/src/MinCollector.cpp +++ b/src/MinCollector.cpp @@ -159,8 +159,15 @@ int MinCollector::modeKmers(std::vector, int32_t int MinCollector::intersectKmers(std::vector, int32_t>>& v1, std::vector, int32_t>>& v2, bool nonpaired, Roaring& r) const { - Roaring u1 = intersectECs(v1); - Roaring u2 = intersectECs(v2); + Roaring u1, u2; + if (!index.do_union) { + u1 = intersectECs(v1); + u2 = intersectECs(v2); + } else { + u1 = unionECs(v1); + u2 = unionECs(v2); + } + if (u1.isEmpty() && u2.isEmpty()) { return -1; @@ -183,12 +190,30 @@ int 
MinCollector::intersectKmers(std::vector, in if (index.dfk_onlist) { // In case we want to not intersect D-list targets includeDList(u1, u2, index.onlist_sequences); } + // Begin Shading + if (index.use_shade) u1 = u1 - index.shade_sequences; + if (index.use_shade) u2 = u2 - index.shade_sequences; + // End Shading r = u1 & u2; } if (r.isEmpty()) { return -1; } + // Begin Shading + if (index.use_shade) { + // Take the union of the shades + u1 = unionECs(v1); + u2 = unionECs(v2); + Roaring r_shade = (u1 | u2) & index.shade_sequences; + Roaring r_shade_final; + for (auto shade : r_shade) { // Make sure the shades correspond to the targets in the intersection + auto color = index.shadeToColorTranscriptMap[shade]; + if (r.contains(color)) r_shade_final.add(shade); + } + r |= r_shade_final; // Add the shades to the equivalence class + } + // End Shading return 1; } @@ -414,15 +439,21 @@ Roaring MinCollector::intersectECs(std::vector, r = v[0].first.getData()->ec[v[0].first.dist].getIndices(); + // Begin Shading + if (index.use_shade) r = r - index.shade_sequences; + // End Shading bool found_nonempty = !r.isEmpty(); Roaring lastEC = r; Roaring ec; - + for (int i = 1; i < v.size(); i++) { // Find a non-empty EC before we start taking the intersection if (!found_nonempty) { r = v[i].first.getData()->ec[v[i].first.dist].getIndices(); + // Begin Shading + if (index.use_shade) r = r - index.shade_sequences; + // End Shading found_nonempty = !r.isEmpty(); } @@ -430,6 +461,9 @@ Roaring MinCollector::intersectECs(std::vector, !(v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist])) { ec = v[i].first.getData()->ec[v[i].first.dist].getIndices(); + // Begin Shading + if (index.use_shade) ec = ec - index.shade_sequences; + // End Shading // Don't intersect empty EC (because of thresholding) if (!(ec == lastEC) && !ec.isEmpty()) { @@ -457,7 +491,57 @@ Roaring MinCollector::intersectECs(std::vector, if ((maxpos-minpos + k) < min_range) { return 
{}; } + + return r; +} + +Roaring MinCollector::unionECs(std::vector, int32_t>>& v) const { + Roaring r; + if (v.empty()) { + return r; + } + sort(v.begin(), v.end(), [&](const std::pair, int>& a, const std::pair, int>& b) + { + if (a.first.isSameReferenceUnitig(b.first) && + a.first.getData()->ec[a.first.dist] == b.first.getData()->ec[b.first.dist]) { + return a.second < b.second; + } else { + return a.first.getData()->id < b.first.getData()->id; + } + }); // sort by contig, and then first position + + r = v[0].first.getData()->ec[v[0].first.dist].getIndices(); + bool found_nonempty = !r.isEmpty(); + Roaring lastEC = r; + Roaring ec; + + for (int i = 1; i < v.size(); i++) { + // Find a non-empty EC before we start taking the union + if (!found_nonempty) { + r = v[i].first.getData()->ec[v[i].first.dist].getIndices(); + found_nonempty = !r.isEmpty(); + } + + if (!v[i].first.isSameReferenceUnitig(v[i-1].first) || + !(v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist])) { + ec = v[i].first.getData()->ec[v[i].first.dist].getIndices(); + r |= ec; + } + } + + // find the range of support + int minpos = std::numeric_limits::max(); + int maxpos = 0; + + for (auto& x : v) { + minpos = std::min(minpos, x.second); + maxpos = std::max(maxpos, x.second); + } + if ((maxpos-minpos + k) < min_range) { + return {}; + } + return r; } diff --git a/src/MinCollector.h b/src/MinCollector.h index dd6b4e75..e3fc3dcb 100644 --- a/src/MinCollector.h +++ b/src/MinCollector.h @@ -55,6 +55,7 @@ struct MinCollector { Roaring intersectECs(std::vector, int32_t>>& v) const; Roaring intersectECs_long(std::vector, int32_t>>& v) const; + Roaring unionECs(std::vector, int32_t>>& v) const; Roaring modeECs(std::vector, int32_t>>& v) const; int intersectKmersCFC(std::vector, int32_t>>& v1, std::vector, int32_t>>& v3, diff --git a/src/ProcessReads.cpp b/src/ProcessReads.cpp index d66e7f74..1d1e4fff 100755 --- a/src/ProcessReads.cpp +++ b/src/ProcessReads.cpp 
@@ -58,7 +58,28 @@ std::pair, int> findFirstMappingKmer(const std::vector, int32_t> >& v, const std::vector, int32_t> >& v2) { +void doStrandSpecificity(Roaring& u, const ProgramOptions::StrandType strand, const std::vector, int32_t> >& v, const std::vector, int32_t> >& v2, bool comprehensive = false) { + if (comprehensive) { // Comprehensive means we check every position's strand specificity (not used in standard usage) + Roaring final_u; + for (int i = 0; i < v.size(); i++) { + Roaring u_ = u; // for union and shades and no-jump + std::vector, int32_t> > v_; + std::vector, int32_t> > v2_; + v_.push_back(v[i]); + doStrandSpecificity(u_, strand, v_, v2, false); + final_u |= u_; + } + for (int i = 0; i < v2.size(); i++) { + Roaring u_ = u; // for union and shades and no-jump + std::vector, int32_t> > v_; + std::vector, int32_t> > v2_; + v2_.push_back(v2[i]); + doStrandSpecificity(u_, strand, v_, v2, false); + final_u |= u_; + } + u = final_u; + return; + } int p = -1; const_UnitigMap um; Roaring vtmp; @@ -1115,7 +1136,12 @@ void ReadProcessor::processBuffer() { } if (mp.opt.strand_specific && !u.isEmpty()) { - doStrandSpecificity(u, mp.opt.strand, v1, v2); + bool comprehensive = false; + if (mp.opt.do_union || mp.opt.no_jump) comprehensive = true; + // Begin Shading + if (index.use_shade) comprehensive = true; + // End Shading + doStrandSpecificity(u, mp.opt.strand, v1, v2, comprehensive); } // find the ec @@ -1700,7 +1726,12 @@ void BUSProcessor::processBuffer() { } if (doStrandSpecificityIfPossible && mp.opt.strand_specific && !u.isEmpty()) { // Strand-specificity - doStrandSpecificity(u, mp.opt.strand, v, v2); + bool comprehensive = false; + if (mp.opt.do_union || mp.opt.no_jump) comprehensive = true; + // Begin Shading + if (index.use_shade) comprehensive = true; + // End Shading + doStrandSpecificity(u, mp.opt.strand, v, v2, comprehensive); } // find the ec diff --git a/src/common.h b/src/common.h index 73fe54f3..a1782be8 100755 --- a/src/common.h +++ 
b/src/common.h @@ -135,6 +135,8 @@ struct ProgramOptions { bool make_unique; bool fusion; bool dfk_onlist; + bool do_union; + bool no_jump; enum class StrandType {None, FR, RF}; StrandType strand; std::string gfa; // used for inspect @@ -200,7 +202,9 @@ ProgramOptions() : single_overhang(false), aa(false), distinguish(false), - d_list_overhang(1) + d_list_overhang(1), + do_union(false), + no_jump(false) {} }; diff --git a/src/main.cpp b/src/main.cpp index 00bd8717..3d1097fb 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -221,6 +221,8 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) { int pbam_flag = 0; int gbam_flag = 0; int fusion_flag = 0; + int do_union_flag = 0; + int no_jump_flag = 0; const char *opt_string = "t:i:l:P:s:o:n:m:d:b:g:c:p:"; static struct option long_options[] = { @@ -251,6 +253,8 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) { {"gtf", required_argument, 0, 'g'}, {"chromosomes", required_argument, 0, 'c'}, {"priors", required_argument, 0, 'p'}, + {"union", no_argument, &do_union_flag, 1}, + {"no-jump", no_argument, &no_jump_flag, 1}, {0,0,0,0} }; int c; @@ -377,6 +381,14 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) { if (fusion_flag) { opt.fusion = true; } + + if (do_union_flag) { + opt.do_union = true; + } + + if (no_jump_flag) { + opt.no_jump = true; + } } void ParseOptionsTCCQuant(int argc, char **argv, ProgramOptions& opt) { @@ -539,6 +551,8 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) { int interleaved_flag = 0; int batch_barcodes_flag = 0; int dfk_onlist_flag = 0; + int do_union_flag = 0; + int no_jump_flag = 0; const char *opt_string = "i:o:x:t:lbng:c:T:P:r:e:B:N:"; static struct option long_options[] = { @@ -569,6 +583,8 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) { {"inleaved", no_argument, &interleaved_flag, 1}, {"numReads", required_argument, 0, 'N'}, {"batch-barcodes", no_argument, &batch_barcodes_flag, 1}, + {"union", 
no_argument, &do_union_flag, 1}, + {"no-jump", no_argument, &no_jump_flag, 1}, {0,0,0,0} }; @@ -732,6 +748,14 @@ void ParseOptionsBus(int argc, char **argv, ProgramOptions& opt) { opt.dfk_onlist = true; } + if (do_union_flag) { + opt.do_union = true; + } + + if (no_jump_flag) { + opt.no_jump = true; + } + opt.single_overhang = true; // throw warning when --aa is passed with paired-end arg @@ -941,6 +965,15 @@ bool CheckOptionsBus(ProgramOptions& opt) { std::cerr << "Threshold not in (0,1). Setting default threshold for unmapped kmers to 0.8" << std::endl; opt.threshold = 0.8; } + + if (opt.do_union && (opt.long_read || opt.aa)) { + std::cerr << "--union is not compatible with this mode" << std::endl; + ret = false; + } + if (opt.no_jump && (opt.long_read || opt.aa)) { + std::cerr << "--no-jump is not compatible with this mode" << std::endl; + ret = false; + } if (opt.long_read) { //opt.error_rate <= 0) { //hiding for release, not used for this version @@ -1626,6 +1659,16 @@ bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) { cerr << "Error: cannot supply mean/sd without supplying both -l and -s" << endl; ret = false; } + + + if (opt.do_union && (opt.long_read || opt.aa)) { + std::cerr << "--union is not compatible with this mode" << std::endl; + ret = false; + } + if (opt.no_jump && (opt.long_read || opt.aa)) { + std::cerr << "--no-jump is not compatible with this mode" << std::endl; + ret = false; + } if ((opt.single_end && !opt.long_read) && (opt.fld == 0.0 || opt.sd == 0.0)) { cerr << "Error: fragment length mean and sd must be supplied for single-end reads using -l and -s" << endl; @@ -2973,7 +3016,7 @@ int main(int argc, char *argv[]) { } EMAlgorithm em(collection.counts, index, collection, fl_means, opt); - em.set_priors(priors); + if (opt.priors != "") em.set_priors(priors); em.run(10000, 50, false, false); if (isMatrixFile) { // Update abundances matrix @@ -3135,8 +3178,6 @@ int main(int argc, char *argv[]) { } }; // end of EM_lambda - 
priors.clear(); - std::vector workers; int num_ids = nrow;