Skip to content

Commit

Permalink
union, em priors (fix), shade, and no-jump features
Browse files Browse the repository at this point in the history
  • Loading branch information
Yenaled committed Sep 17, 2024
1 parent c178de6 commit 7b1e135
Show file tree
Hide file tree
Showing 7 changed files with 297 additions and 12 deletions.
114 changes: 112 additions & 2 deletions src/KmerIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <iostream>
#include <unordered_map>
#include <string>
#include <set>
#include "ColoredCDBG.hpp"

// --aa option helper functions
Expand Down Expand Up @@ -231,6 +232,18 @@ std::pair<size_t,size_t> KmerIndex::getECInfo() const {
return std::make_pair(max_ec_len, cardinality_zero_encounters);
}

// Begin Shading
// Splits a target name of the form "<target>_shade_<variant>" at the first
// occurrence of the "_shade_" marker.
//
// Returns {target, variant} when the marker is present, where target is the
// text before the marker and variant is everything after it. Returns a pair
// of empty strings when the marker does not occur in name.
// Note: a name that begins with the marker also yields an empty target.
std::pair<std::string,std::string> shadedTargetName(std::string& name) {
  const std::string marker = "_shade_";
  const std::size_t marker_pos = name.find(marker);

  // Guard clause: no marker means this target is not a shade
  if (marker_pos == std::string::npos) {
    return std::make_pair(std::string(), std::string());
  }

  // Text before the marker is the target name; text after it is the shade
  std::string target = name.substr(0, marker_pos);
  std::string shade = name.substr(marker_pos + marker.length());
  return std::make_pair(target, shade); // Return the target name and the associated shade
}
// End Shading

void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out) {
// read input
u_set_<std::string> unique_names;
Expand Down Expand Up @@ -354,6 +367,18 @@ void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out)
}
unique_names.insert(name);
target_names_.push_back(name);

// Begin Shading
auto shade_info = shadedTargetName(name);
if (shade_info.first != "") {
std::string tname = shade_info.first;
std::string variant = shade_info.second;
auto it = std::find(target_names_.begin(), target_names_.end(), tname);
if (it != target_names_.end()) {
shadeToColorTranscriptMap[target_names_.size()-1] = std::distance(target_names_.begin(), it);
}
}
// End Shading
}
}

Expand Down Expand Up @@ -399,6 +424,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
size_t num_seqs = 0;
int max_color = 0;
u_set_<int> external_input_names;
// Begin Shading
std::set<std::string> variants_set; // Ordered set to keep track of variants (i.e. colors with shades)
// End Shading
for (int i = 0; i < opt.transfasta.size(); i++) { // Currently, this should only be one file
auto fasta = opt.transfasta[i];
fp = opt.transfasta.size() == 1 && opt.transfasta[0] == "-" ? gzdopen(fileno(stdin), "r") : gzopen(fasta.c_str(), "r");
Expand All @@ -414,9 +442,23 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
continue;
}
int color = std::atoi(strname.c_str());
// Begin Shading
std::string variant;
auto shade_info = shadedTargetName(strname);
if (shade_info.first != "") {
std::string tname = shade_info.first;
variant = shade_info.second;
color = std::atoi(tname.c_str());
variants_set.insert(std::to_string(color) + "_shade_" + variant);
}
// End Shading
external_input_names.insert(color);
if (color > max_color) max_color = color;
of << ">" << std::to_string(color) << "\n" << str << std::endl;
of << ">" << std::to_string(color);
// Begin Shading
if (!variant.empty()) of << "_shade_" << variant;
// End Shading
of << "\n" << str << std::endl;
num_seqs++;
}
gzclose(fp);
Expand All @@ -437,6 +479,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
target_names_.push_back(std::to_string(i));
target_lens_.push_back(k); // dummy length (k-mer size)
}
// Begin Shading
for (const auto& v : variants_set) {
num_trans++; // Each color-shade duo counts as an additional target
target_names_.push_back(v);
target_lens_.push_back(k); // dummy length (k-mer size)
}
if (num_trans != ncolors) {
std::cerr << "[build] Detected " << std::to_string(num_trans-ncolors) << " shades" << std::endl;
}
// End Shading

std::cerr << "[build] Building graph from k-mers" << std::endl;
BuildDeBruijnGraph(opt, tmp_file2, out);
Expand All @@ -449,6 +501,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
std::vector<std::vector<TRInfo> > trinfos(dbg.size());
std::ifstream infile_a(tmp_file2);
int current_color = 0;
// Begin Shading
std::string current_variant;
// End Shading
std::string line;
while (std::getline(infile_a, line)) {
if (line.length() == 0) {
Expand All @@ -458,6 +513,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
current_color = onlist_sequences.cardinality();
} else {
current_color = std::atoi(line.c_str()+1);
// Begin Shading
current_variant = "";
std::string name = line.substr(1);
auto shade_info = shadedTargetName(name);
if (shade_info.first != "") {
std::string tname = shade_info.first;
current_variant = shade_info.second;
current_color = std::atoi(tname.c_str());
}
// End Shading
}
continue;
}
Expand All @@ -481,8 +546,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
tr.pos = (proc-um.len) | (!um.strand ? sense : missense);
tr.start = um.dist;
tr.stop = um.dist + um.len;

trinfos[n->id].push_back(tr);

// Begin Shading
if (!current_variant.empty()) {
auto it = variants_set.find(std::to_string(current_color) + "_shade_" + current_variant);
assert(it != variants_set.end());
tr.trid = ncolors + std::distance(variants_set.begin(), it);
trinfos[n->id].push_back(tr);
}
// End Shading
}
}
infile_a.close();
Expand Down Expand Up @@ -995,6 +1068,13 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st
tr.stop = um.dist + um.len;

trinfos[n->id].push_back(tr);
// Begin Shading
auto it = shadeToColorTranscriptMap.find(tr.trid);
if (it != shadeToColorTranscriptMap.end()) {
tr.trid = shadeToColorTranscriptMap[tr.trid];
trinfos[n->id].push_back(tr); // Add the color of the original transcript as well
}
// End Shading
}
j++;
}
Expand All @@ -1020,6 +1100,11 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st

std::cerr << "[build] target de Bruijn graph has k-mer length " << dbg.getK() << " and minimizer length " << dbg.getG() << std::endl;
std::cerr << "[build] target de Bruijn graph has " << dbg.size() << " contigs and contains " << dbg.nbKmers() << " k-mers " << std::endl;
// Begin Shading
if (shadeToColorTranscriptMap.size() != 0) {
std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
}
// End Shading
}

void KmerIndex::PopulateMosaicECs(std::vector<std::vector<TRInfo> >& trinfos) {
Expand Down Expand Up @@ -1418,6 +1503,19 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
in.read(buffer, tmp_size);

target_names_.push_back(std::string(buffer));
// Begin Shading
auto shade_info = shadedTargetName(target_names_[target_names_.size()-1]);
if (shade_info.first != "") {
std::string tname = shade_info.first;
std::string variant = shade_info.second;
auto it = std::find(target_names_.begin(), target_names_.end(), tname);
if (it != target_names_.end()) {
shadeToColorTranscriptMap[i] = std::distance(target_names_.begin(), it);
}
use_shade = true;
shade_sequences.add(i);
}
// End Shading
}
delete[] buffer;

Expand All @@ -1438,6 +1536,11 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
if (num_trans-onlist_sequences.cardinality() > 0) {
std::cerr << "[index] number of D-list k-mers: " << pretty_num(static_cast<size_t>(num_trans-onlist_sequences.cardinality())) << std::endl;
}
// Begin Shading
if (shadeToColorTranscriptMap.size() != 0) {
std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
}
// End Shading

in.close();

Expand Down Expand Up @@ -1594,6 +1697,11 @@ int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2) const {
// post: v contains all equiv classes for the k-mers in s
void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMap<Node>, int>>& v, bool partial, bool cfc) const{
const Node* n;

// Begin Shading
if (use_shade) partial = false;
// End Shading
if (do_union) partial = false;

// TODO:
// Rework KmerIndex::match() such that it uses the following type of logic
Expand Down Expand Up @@ -1664,6 +1772,8 @@ void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMa
}

v.push_back({um, kit->second});

if (no_jump) continue;

// Find start and end of O.G. kallisto contig w.r.t. the bifrost-kallisto
// unitig
Expand Down
14 changes: 14 additions & 0 deletions src/KmerIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ struct KmerIndex {
//LoadTranscripts(opt.transfasta);
load_positional_info = opt.bias || opt.pseudobam || opt.genomebam || !opt.single_overhang;
dfk_onlist = opt.dfk_onlist;
do_union = opt.do_union;
no_jump = opt.no_jump;
// Begin Shading
use_shade = false;
// End Shading
}

~KmerIndex() {}
Expand Down Expand Up @@ -131,6 +136,8 @@ struct KmerIndex {
std::vector<std::string> target_names_;
std::vector<std::string> target_seqs_; // populated on demand
bool dfk_onlist; // If we want to not use D-list in intersecting ECs
bool do_union; // If we want to do "pseudoalignment" via a "union" rather than an "intersection"
bool no_jump; // If we want to skip the jumping logic during pseudoalignment
bool target_seqs_loaded;
bool load_positional_info; // when should we load positional info in addition to strandedness

Expand All @@ -141,6 +148,13 @@ struct KmerIndex {
u_set_<Kmer, KmerHash> d_list;
Kmer dummy_dfk;
const_UnitigMap<Node> um_dummy;

// Begin Shading
// Here, we use the concept of "shades" as proposed in Ornaments by Adduri & Kim, 2024, for bias-corrected allele-specific expression estimation
std::unordered_map<int, int> shadeToColorTranscriptMap;
Roaring shade_sequences;
bool use_shade;
// End Shading
};

#endif // KALLISTO_KMERINDEX_H
90 changes: 87 additions & 3 deletions src/MinCollector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,8 +159,15 @@ int MinCollector::modeKmers(std::vector<std::pair<const_UnitigMap<Node>, int32_t

int MinCollector::intersectKmers(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v1,
std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v2, bool nonpaired, Roaring& r) const {
Roaring u1 = intersectECs(v1);
Roaring u2 = intersectECs(v2);
Roaring u1, u2;
if (!index.do_union) {
u1 = intersectECs(v1);
u2 = intersectECs(v2);
} else {
u1 = unionECs(v1);
u2 = unionECs(v2);
}


if (u1.isEmpty() && u2.isEmpty()) {
return -1;
Expand All @@ -183,12 +190,30 @@ int MinCollector::intersectKmers(std::vector<std::pair<const_UnitigMap<Node>, in
if (index.dfk_onlist) { // In case we want to not intersect D-list targets
includeDList(u1, u2, index.onlist_sequences);
}
// Begin Shading
if (index.use_shade) u1 = u1 - index.shade_sequences;
if (index.use_shade) u2 = u2 - index.shade_sequences;
// End Shading
r = u1 & u2;
}

if (r.isEmpty()) {
return -1;
}
// Begin Shading
if (index.use_shade) {
// Take the union of the shades
u1 = unionECs(v1);
u2 = unionECs(v2);
Roaring r_shade = (u1 | u2) & index.shade_sequences;
Roaring r_shade_final;
for (auto shade : r_shade) { // Make sure the shades correspond to the targets in the intersection
auto color = index.shadeToColorTranscriptMap[shade];
if (r.contains(color)) r_shade_final.add(shade);
}
r |= r_shade_final; // Add the shades to the equivalence class
}
// End Shading
return 1;
}

Expand Down Expand Up @@ -414,22 +439,31 @@ Roaring MinCollector::intersectECs(std::vector<std::pair<const_UnitigMap<Node>,


r = v[0].first.getData()->ec[v[0].first.dist].getIndices();
// Begin Shading
if (index.use_shade) r = r - index.shade_sequences;
// End Shading
bool found_nonempty = !r.isEmpty();
Roaring lastEC = r;
Roaring ec;

for (int i = 1; i < v.size(); i++) {

// Find a non-empty EC before we start taking the intersection
if (!found_nonempty) {
r = v[i].first.getData()->ec[v[i].first.dist].getIndices();
// Begin Shading
if (index.use_shade) r = r - index.shade_sequences;
// End Shading
found_nonempty = !r.isEmpty();
}

if (!v[i].first.isSameReferenceUnitig(v[i-1].first) ||
!(v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist])) {

ec = v[i].first.getData()->ec[v[i].first.dist].getIndices();
// Begin Shading
if (index.use_shade) ec = ec - index.shade_sequences;
// End Shading

// Don't intersect empty EC (because of thresholding)
if (!(ec == lastEC) && !ec.isEmpty()) {
Expand Down Expand Up @@ -457,7 +491,57 @@ Roaring MinCollector::intersectECs(std::vector<std::pair<const_UnitigMap<Node>,
if ((maxpos-minpos + k) < min_range) {
return {};
}

return r;
}

// Computes the union of the equivalence classes (ECs) hit by the k-mer
// matches in v.
//
// v: (unitig mapping, position) pairs; this function sorts v in place.
// Returns the union bitmap, or an empty Roaring when v is empty or when the
// positions in v span too short a range (maxpos - minpos + k < min_range).
Roaring MinCollector::unionECs(std::vector<std::pair<const_UnitigMap<Node>, int32_t>>& v) const {
  if (v.empty()) {
    return Roaring();
  }

  std::sort(v.begin(), v.end(), [&](const std::pair<const_UnitigMap<Node>, int>& a, const std::pair<const_UnitigMap<Node>, int>& b)
  {
    if (a.first.isSameReferenceUnitig(b.first) &&
        a.first.getData()->ec[a.first.dist] == b.first.getData()->ec[b.first.dist]) {
      return a.second < b.second;
    } else {
      return a.first.getData()->id < b.first.getData()->id;
    }
  }); // sort by contig, and then first position

  // Seed the result with the first EC
  Roaring r = v[0].first.getData()->ec[v[0].first.dist].getIndices();
  bool found_nonempty = !r.isEmpty();

  for (std::size_t i = 1; i < v.size(); ++i) {
    // Until a non-empty EC has been seen, restart the result from the current EC
    if (!found_nonempty) {
      r = v[i].first.getData()->ec[v[i].first.dist].getIndices();
      found_nonempty = !r.isEmpty();
    }

    // Consecutive matches on the same unitig with an equal EC add nothing new,
    // so only OR in the EC when either of those differs from the previous match
    const bool same_as_previous =
        v[i].first.isSameReferenceUnitig(v[i-1].first) &&
        (v[i].first.getData()->ec[v[i].first.dist] == v[i-1].first.getData()->ec[v[i-1].first.dist]);
    if (!same_as_previous) {
      r |= v[i].first.getData()->ec[v[i].first.dist].getIndices();
    }
  }

  // find the range of support
  int minpos = std::numeric_limits<int>::max();
  int maxpos = 0;

  for (const auto& x : v) {
    minpos = std::min(minpos, x.second);
    maxpos = std::max(maxpos, x.second);
  }

  // Reject matches whose positions cover fewer than min_range bases
  if ((maxpos-minpos + k) < min_range) {
    return {};
  }

  return r;
}

Expand Down
Loading

0 comments on commit 7b1e135

Please sign in to comment.