Skip to content

Commit

Permalink
Merge pull request #461 from pachterlab/devel
Browse files Browse the repository at this point in the history
Devel
  • Loading branch information
Yenaled authored Sep 17, 2024
2 parents fa01edd + 2cb5e17 commit 0397342
Show file tree
Hide file tree
Showing 9 changed files with 322 additions and 18 deletions.
8 changes: 7 additions & 1 deletion ext/bifrost/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,13 @@ target_link_libraries(bifrost_static ${ZLIB_LIBRARIES})
target_link_libraries(bifrost_dynamic ${ZLIB_LIBRARIES})

# Put the zlib headers on the include path, or abort configuration when zlib is unavailable.
if (ZLIB_FOUND)
# NOTE(review): this unconditional include duplicates the ZLIB_INCLUDE_DIRS branch below; it looks like the pre-change line retained by the diff view -- confirm.
include_directories(${ZLIB_INCLUDE_DIRS})
# Use ZLIB_INCLUDE_DIRS when it is defined; otherwise fall back to ZLIB_INCLUDE_DIR.
if (DEFINED ZLIB_INCLUDE_DIRS)
include_directories( ${ZLIB_INCLUDE_DIRS} )
elseif (DEFINED ZLIB_INCLUDE_DIR)
include_directories( ${ZLIB_INCLUDE_DIR} )
else()
# Neither variable is defined, so there is no header location to add.
message(FATAL_ERROR "zlib found but no include directories are set.")
endif()
else()
message(FATAL_ERROR "zlib not found. Required for to output files")
endif(ZLIB_FOUND)
Expand Down
8 changes: 7 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,13 @@ if (NOT ZLIBNG)
find_package( ZLIB REQUIRED )

if ( ZLIB_FOUND )
include_directories( ${ZLIB_INCLUDE_DIRS} )
if (DEFINED ZLIB_INCLUDE_DIRS)
include_directories( ${ZLIB_INCLUDE_DIRS} )
elseif (DEFINED ZLIB_INCLUDE_DIR)
include_directories( ${ZLIB_INCLUDE_DIR} )
else()
message(FATAL_ERROR "zlib found but no include directories are set.")
endif()
target_link_libraries(kallisto kallisto_core ${ZLIB_LIBRARIES})
else()
message(FATAL_ERROR "zlib not found. Required for to output files" )
Expand Down
114 changes: 112 additions & 2 deletions src/KmerIndex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <iostream>
#include <unordered_map>
#include <string>
#include <set>
#include "ColoredCDBG.hpp"

// --aa option helper functions
Expand Down Expand Up @@ -231,6 +232,18 @@ std::pair<size_t,size_t> KmerIndex::getECInfo() const {
return std::make_pair(max_ec_len, cardinality_zero_encounters);
}

// Begin Shading
// Splits a target name of the form "<target>_shade_<variant>" at the first
// occurrence of the "_shade_" marker.
//
// Returns {target, variant} when the marker is present in `name`; returns a
// pair of empty strings when `name` does not contain the marker (not a shade).
// Note that a name beginning with the marker yields an empty target, which
// callers comparing `.first` against "" will treat as "not a shade".
std::pair<std::string,std::string> shadedTargetName(std::string& name) {
  const std::string marker = "_shade_";
  const std::size_t marker_pos = name.find(marker);

  // Guard clause: no marker means this is an ordinary (unshaded) target.
  if (marker_pos == std::string::npos) {
    return std::make_pair(std::string(), std::string());
  }

  // Everything before the marker is the target; everything after is the shade.
  std::string target = name.substr(0, marker_pos);
  std::string shade = name.substr(marker_pos + marker.length());
  return std::make_pair(target, shade);
}
// End Shading

void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out) {
// read input
u_set_<std::string> unique_names;
Expand Down Expand Up @@ -354,6 +367,18 @@ void KmerIndex::BuildTranscripts(const ProgramOptions& opt, std::ofstream& out)
}
unique_names.insert(name);
target_names_.push_back(name);

// Begin Shading
auto shade_info = shadedTargetName(name);
if (shade_info.first != "") {
std::string tname = shade_info.first;
std::string variant = shade_info.second;
auto it = std::find(target_names_.begin(), target_names_.end(), tname);
if (it != target_names_.end()) {
shadeToColorTranscriptMap[target_names_.size()-1] = std::distance(target_names_.begin(), it);
}
}
// End Shading
}
}

Expand Down Expand Up @@ -399,6 +424,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
size_t num_seqs = 0;
int max_color = 0;
u_set_<int> external_input_names;
// Begin Shading
std::set<std::string> variants_set; // Ordered set to keep track of variants (i.e. colors with shades)
// End Shading
for (int i = 0; i < opt.transfasta.size(); i++) { // Currently, this should only be one file
auto fasta = opt.transfasta[i];
fp = opt.transfasta.size() == 1 && opt.transfasta[0] == "-" ? gzdopen(fileno(stdin), "r") : gzopen(fasta.c_str(), "r");
Expand All @@ -414,9 +442,23 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
continue;
}
int color = std::atoi(strname.c_str());
// Begin Shading
std::string variant;
auto shade_info = shadedTargetName(strname);
if (shade_info.first != "") {
std::string tname = shade_info.first;
variant = shade_info.second;
color = std::atoi(tname.c_str());
variants_set.insert(std::to_string(color) + "_shade_" + variant);
}
// End Shading
external_input_names.insert(color);
if (color > max_color) max_color = color;
of << ">" << std::to_string(color) << "\n" << str << std::endl;
of << ">" << std::to_string(color);
// Begin Shading
if (!variant.empty()) of << "_shade_" << variant;
// End Shading
of << "\n" << str << std::endl;
num_seqs++;
}
gzclose(fp);
Expand All @@ -437,6 +479,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
target_names_.push_back(std::to_string(i));
target_lens_.push_back(k); // dummy length (k-mer size)
}
// Begin Shading
for (const auto& v : variants_set) {
num_trans++; // Each color-shade duo counts as an additional target
target_names_.push_back(v);
target_lens_.push_back(k); // dummy length (k-mer size)
}
if (num_trans != ncolors) {
std::cerr << "[build] Detected " << std::to_string(num_trans-ncolors) << " shades" << std::endl;
}
// End Shading

std::cerr << "[build] Building graph from k-mers" << std::endl;
BuildDeBruijnGraph(opt, tmp_file2, out);
Expand All @@ -449,6 +501,9 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
std::vector<std::vector<TRInfo> > trinfos(dbg.size());
std::ifstream infile_a(tmp_file2);
int current_color = 0;
// Begin Shading
std::string current_variant;
// End Shading
std::string line;
while (std::getline(infile_a, line)) {
if (line.length() == 0) {
Expand All @@ -458,6 +513,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
current_color = onlist_sequences.cardinality();
} else {
current_color = std::atoi(line.c_str()+1);
// Begin Shading
current_variant = "";
std::string name = line.substr(1);
auto shade_info = shadedTargetName(name);
if (shade_info.first != "") {
std::string tname = shade_info.first;
current_variant = shade_info.second;
current_color = std::atoi(tname.c_str());
}
// End Shading
}
continue;
}
Expand All @@ -481,8 +546,16 @@ void KmerIndex::BuildDistinguishingGraph(const ProgramOptions& opt, std::ofstrea
tr.pos = (proc-um.len) | (!um.strand ? sense : missense);
tr.start = um.dist;
tr.stop = um.dist + um.len;

trinfos[n->id].push_back(tr);

// Begin Shading
if (!current_variant.empty()) {
auto it = variants_set.find(std::to_string(current_color) + "_shade_" + current_variant);
assert(it != variants_set.end());
tr.trid = ncolors + std::distance(variants_set.begin(), it);
trinfos[n->id].push_back(tr);
}
// End Shading
}
}
infile_a.close();
Expand Down Expand Up @@ -995,6 +1068,13 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st
tr.stop = um.dist + um.len;

trinfos[n->id].push_back(tr);
// Begin Shading
auto it = shadeToColorTranscriptMap.find(tr.trid);
if (it != shadeToColorTranscriptMap.end()) {
tr.trid = shadeToColorTranscriptMap[tr.trid];
trinfos[n->id].push_back(tr); // Add the color of the original transcript as well
}
// End Shading
}
j++;
}
Expand All @@ -1020,6 +1100,11 @@ void KmerIndex::BuildEquivalenceClasses(const ProgramOptions& opt, const std::st

std::cerr << "[build] target de Bruijn graph has k-mer length " << dbg.getK() << " and minimizer length " << dbg.getG() << std::endl;
std::cerr << "[build] target de Bruijn graph has " << dbg.size() << " contigs and contains " << dbg.nbKmers() << " k-mers " << std::endl;
// Begin Shading
if (shadeToColorTranscriptMap.size() != 0) {
std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
}
// End Shading
}

void KmerIndex::PopulateMosaicECs(std::vector<std::vector<TRInfo> >& trinfos) {
Expand Down Expand Up @@ -1418,6 +1503,19 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
in.read(buffer, tmp_size);

target_names_.push_back(std::string(buffer));
// Begin Shading
auto shade_info = shadedTargetName(target_names_[target_names_.size()-1]);
if (shade_info.first != "") {
std::string tname = shade_info.first;
std::string variant = shade_info.second;
auto it = std::find(target_names_.begin(), target_names_.end(), tname);
if (it != target_names_.end()) {
shadeToColorTranscriptMap[i] = std::distance(target_names_.begin(), it);
}
use_shade = true;
shade_sequences.add(i);
}
// End Shading
}
delete[] buffer;

Expand All @@ -1438,6 +1536,11 @@ void KmerIndex::load(ProgramOptions& opt, bool loadKmerTable, bool loadDlist) {
if (num_trans-onlist_sequences.cardinality() > 0) {
std::cerr << "[index] number of D-list k-mers: " << pretty_num(static_cast<size_t>(num_trans-onlist_sequences.cardinality())) << std::endl;
}
// Begin Shading
if (shadeToColorTranscriptMap.size() != 0) {
std::cerr << "[build] number of shades: " << std::to_string(shadeToColorTranscriptMap.size()) << std::endl;
}
// End Shading

in.close();

Expand Down Expand Up @@ -1594,6 +1697,11 @@ int KmerIndex::mapPair(const char *s1, int l1, const char *s2, int l2) const {
// post: v contains all equiv classes for the k-mers in s
void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMap<Node>, int>>& v, bool partial, bool cfc) const{
const Node* n;

// Begin Shading
if (use_shade) partial = false;
// End Shading
if (do_union) partial = false;

// TODO:
// Rework KmerIndex::match() such that it uses the following type of logic
Expand Down Expand Up @@ -1664,6 +1772,8 @@ void KmerIndex::match(const char *s, int l, std::vector<std::pair<const_UnitigMa
}

v.push_back({um, kit->second});

if (no_jump) continue;

// Find start and end of O.G. kallisto contig w.r.t. the bifrost-kallisto
// unitig
Expand Down
14 changes: 14 additions & 0 deletions src/KmerIndex.h
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ struct KmerIndex {
//LoadTranscripts(opt.transfasta);
load_positional_info = opt.bias || opt.pseudobam || opt.genomebam || !opt.single_overhang;
dfk_onlist = opt.dfk_onlist;
do_union = opt.do_union;
no_jump = opt.no_jump;
// Begin Shading
use_shade = false;
// End Shading
}

~KmerIndex() {}
Expand Down Expand Up @@ -131,6 +136,8 @@ struct KmerIndex {
std::vector<std::string> target_names_;
std::vector<std::string> target_seqs_; // populated on demand
bool dfk_onlist; // If we want to not use D-list in intersecting ECs
bool do_union; // If we want to do "pseudoalignment" via a "union" rather than an "intersection"
bool no_jump; // If we want to skip the jumping logic during pseudoalignment
bool target_seqs_loaded;
bool load_positional_info; // when should we load positional info in addition to strandedness

Expand All @@ -141,6 +148,13 @@ struct KmerIndex {
u_set_<Kmer, KmerHash> d_list;
Kmer dummy_dfk;
const_UnitigMap<Node> um_dummy;

// Begin Shading
// Here, we use the concept of "shades" as proposed in Ornaments (Adduri & Kim, 2024) for bias-corrected allele-specific expression estimation
std::unordered_map<int, int> shadeToColorTranscriptMap;
Roaring shade_sequences;
bool use_shade;
// End Shading
};

#endif // KALLISTO_KMERINDEX_H
Loading

0 comments on commit 0397342

Please sign in to comment.