From 70a6dd53f3641b75e4ba44ebcb696f87fe1ab465 Mon Sep 17 00:00:00 2001 From: Pall Melsted Date: Fri, 6 Feb 2015 10:28:13 -0800 Subject: [PATCH] Reformatting using astyle --- .gitignore | 1 + astyle.txt | 8 + src/EMAlgorithm.h | 264 +++++++-------- src/Kmer.cpp | 206 ++++++------ src/Kmer.hpp | 22 +- src/KmerHashTable.h | 430 ++++++++++++------------ src/KmerIndex.h | 772 +++++++++++++++++++++---------------------- src/KmerIterator.cpp | 58 ++-- src/KmerIterator.hpp | 14 +- src/MinCollector.h | 166 +++++----- src/ProcessReads.h | 144 ++++---- src/common.h | 24 +- src/hash.cpp | 176 +++++----- src/hash.hpp | 4 +- src/kseq.h | 4 +- src/main.cpp | 645 +++++++++++++++++------------------- src/weights.cpp | 0 src/weights.h | 57 ++-- 18 files changed, 1485 insertions(+), 1510 deletions(-) create mode 100644 astyle.txt delete mode 100644 src/weights.cpp diff --git a/.gitignore b/.gitignore index e39dee7f..f855116a 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,4 @@ src/kallisto src/*.a CTestTestfile.cmake *~ +*.orig diff --git a/astyle.txt b/astyle.txt new file mode 100644 index 00000000..2320d6eb --- /dev/null +++ b/astyle.txt @@ -0,0 +1,8 @@ +close-templates +add-brackets +align-reference=type +align-pointer=name +style=google +indent=spaces=2 +keep-one-line-blocks +keep-one-line-statements diff --git a/src/EMAlgorithm.h b/src/EMAlgorithm.h index f44abd99..164ab36c 100644 --- a/src/EMAlgorithm.h +++ b/src/EMAlgorithm.h @@ -14,153 +14,153 @@ const double TOLERANCE = 1e-5; template struct EMAlgorithm { - // counts is vector from collector, with indices corresponding to ec ids - // TODO: initialize alpha a bit more intelligently - // TODO: refactor to remove dependence on Index - EMAlgorithm(const ProgramOptions& opt, const Index& idx, - const std::vector& counts, - const std::vector& eff_lens, - const WeightMap& wm) : - idx_(idx), - num_trans_(idx.num_trans), - counts_(counts), - eff_lens_(eff_lens), - weight_map_(wm), - alpha_(idx.num_trans, 1.0/idx.num_trans), // uniform distribution over transcripts - rho_(idx.num_trans, 0.0), - rho_set_(false) - {} - - ~EMAlgorithm() {} - - void run(size_t n_iter = 500) { - std::vector next_alpha(alpha_.size(), 0.0); - - assert(weight_map_.size() == counts_.size()); - - double denom; - - std::cout << "[em]\tfishing for the right mixture (. = 50 rounds)" << - std::endl; - - for (auto i = 0; i < n_iter; ++i) { - if (i % 50 == 0) { - std::cout << "."; - if (i % 500 == 0 && i > 0) { - std::cout << std::endl; - } - } - - for (auto& ec_kv : idx_.ecmap ) { - denom = 0.0; - - // first, compute the denominator: a normalizer - // iterate over transcripts in EC map - auto w_search = weight_map_.find(ec_kv.first); - - // everything in ecmap should be in weight_map - assert( w_search != weight_map_.end() ); - assert( w_search->second.size() == ec_kv.second.size() ); - - for (auto t_it = 0; t_it < ec_kv.second.size(); ++t_it) { - denom += alpha_[ec_kv.second[t_it]] * w_search->second[t_it]; - } - - if (denom < TOLERANCE) { - continue; - } - - /* std::cout << "denom: " << denom << std::endl; */ - - // compute the update step - for (auto t_it = 0; t_it < ec_kv.second.size(); ++t_it) { - next_alpha[ec_kv.second[t_it]] += counts_[ec_kv.first] * - (w_search->second[t_it] * alpha_[ec_kv.second[t_it]] / denom); - } - } - - // TODO: check for relative difference for convergence in EM - - // reassign alpha_ to next_alpha - std::copy(next_alpha.begin(), next_alpha.end(), alpha_.begin()); - - // clear all next_alpha values 0 for next iteration - std::fill(next_alpha.begin(), next_alpha.end(), 0.0); - } - - std::cout << std::endl; - std::cout.flush(); - } - - void compute_rho() { - if (rho_set_) { - // rho has already been set, let's clear it - std::fill(rho_.begin(), rho_.end(), 0.0); + // counts is vector from collector, with indices corresponding to ec ids + // TODO: initialize alpha a bit more intelligently + // TODO: refactor to remove dependence on Index + EMAlgorithm(const ProgramOptions& opt, const Index& idx, + const std::vector& counts, + const std::vector& eff_lens, + const WeightMap& wm) : + idx_(idx), + num_trans_(idx.num_trans), + counts_(counts), + eff_lens_(eff_lens), + weight_map_(wm), + alpha_(idx.num_trans, 1.0/idx.num_trans), // uniform distribution over transcripts + rho_(idx.num_trans, 0.0), + rho_set_(false) + {} + + ~EMAlgorithm() {} + + void run(size_t n_iter = 500) { + std::vector next_alpha(alpha_.size(), 0.0); + + assert(weight_map_.size() == counts_.size()); + + double denom; + + std::cout << "[em]\tfishing for the right mixture (. = 50 rounds)" << + std::endl; + + for (auto i = 0; i < n_iter; ++i) { + if (i % 50 == 0) { + std::cout << "."; + if (i % 500 == 0 && i > 0) { + std::cout << std::endl; } + } - double total {0.0}; - for (auto i = 0; i < alpha_.size(); ++i) { - // TODO: consider what the right tolerance is - if (eff_lens_[i] < TOLERANCE) { - continue; - } - rho_[i] = alpha_[i] / eff_lens_[i]; - total += rho_[i]; + for (auto& ec_kv : idx_.ecmap ) { + denom = 0.0; + + // first, compute the denominator: a normalizer + // iterate over transcripts in EC map + auto w_search = weight_map_.find(ec_kv.first); + + // everything in ecmap should be in weight_map + assert( w_search != weight_map_.end() ); + assert( w_search->second.size() == ec_kv.second.size() ); + + for (auto t_it = 0; t_it < ec_kv.second.size(); ++t_it) { + denom += alpha_[ec_kv.second[t_it]] * w_search->second[t_it]; + } + + if (denom < TOLERANCE) { + continue; } - for (auto& r : rho_) { - r /= total; + /* std::cout << "denom: " << denom << std::endl; */ + + // compute the update step + for (auto t_it = 0; t_it < ec_kv.second.size(); ++t_it) { + next_alpha[ec_kv.second[t_it]] += counts_[ec_kv.first] * + (w_search->second[t_it] * alpha_[ec_kv.second[t_it]] / denom); } + } + + // TODO: check for relative difference for convergence in EM + + // reassign alpha_ to next_alpha + std::copy(next_alpha.begin(), next_alpha.end(), alpha_.begin()); - rho_set_ = true; + // clear all next_alpha values 0 for next iteration + std::fill(next_alpha.begin(), next_alpha.end(), 0.0); } - void write(const std::string& dir_out) const { - const std::string out_fname = "/expression.txt"; + std::cout << std::endl; + std::cout.flush(); + } - std::ofstream out; - out.open(dir_out + out_fname, std::ios::out); + void compute_rho() { + if (rho_set_) { + // rho has already been set, let's clear it + std::fill(rho_.begin(), rho_.end(), 0.0); + } - if (!out.is_open()) { - std::cerr << "Error opening '" << dir_out + out_fname << "'" << - std::endl; - exit(1); - } + double total {0.0}; + for (auto i = 0; i < alpha_.size(); ++i) { + // TODO: consider what the right tolerance is + if (eff_lens_[i] < TOLERANCE) { + continue; + } + rho_[i] = alpha_[i] / eff_lens_[i]; + total += rho_[i]; + } + + for (auto& r : rho_) { + r /= total; + } + + rho_set_ = true; + } - out.precision(15); - - out << - "target_id" << "\t" << - "kallisto_id" << "\t" << - "rho" << "\t" << - "tpm" << "\t" << - "est_counts" << - std::endl; - - const double MILLION = 1e6; - - for (auto i = 0; i < rho_.size(); ++i) { - out << - idx_.target_names_[i] << "\t" << - i << "\t" << - rho_[i] << "\t" << - rho_[i] * MILLION << "\t" << - alpha_[i] << + void write(const std::string& dir_out) const { + const std::string out_fname = "/expression.txt"; + + std::ofstream out; + out.open(dir_out + out_fname, std::ios::out); + + if (!out.is_open()) { + std::cerr << "Error opening '" << dir_out + out_fname << "'" << std::endl; - } + exit(1); + } - out.flush(); - out.close(); + out.precision(15); + + out << + "target_id" << "\t" << + "kallisto_id" << "\t" << + "rho" << "\t" << + "tpm" << "\t" << + "est_counts" << + std::endl; + + const double MILLION = 1e6; + + for (auto i = 0; i < rho_.size(); ++i) { + out << + idx_.target_names_[i] << "\t" << + i << "\t" << + rho_[i] << "\t" << + rho_[i] * MILLION << "\t" << + alpha_[i] << + std::endl; } - int num_trans_; - const Index &idx_; - const std::vector& counts_; - const std::vector& eff_lens_; - const WeightMap& weight_map_; - std::vector alpha_; - std::vector rho_; - bool rho_set_; + out.flush(); + out.close(); + } + + int num_trans_; + const Index& idx_; + const std::vector& counts_; + const std::vector& eff_lens_; + const WeightMap& weight_map_; + std::vector alpha_; + std::vector rho_; + bool rho_set_; }; #endif // KALLISTO_EMALGORITHM_H diff --git a/src/Kmer.cpp b/src/Kmer.cpp index 3ce1f47e..2810a431 100644 --- a/src/Kmer.cpp +++ b/src/Kmer.cpp @@ -19,38 +19,38 @@ void int2bin(uint32_t a, char *buffer, int buf_size) { */ static const uint64_t twin_table[256] = { -0xFF, 0xBF, 0x7F, 0x3F, 0xEF, 0xAF, 0x6F, 0x2F, -0xDF, 0x9F, 0x5F, 0x1F, 0xCF, 0x8F, 0x4F, 0x0F, -0xFB, 0xBB, 0x7B, 0x3B, 0xEB, 0xAB, 0x6B, 0x2B, -0xDB, 0x9B, 0x5B, 0x1B, 0xCB, 0x8B, 0x4B, 0x0B, -0xF7, 0xB7, 0x77, 0x37, 0xE7, 0xA7, 0x67, 0x27, -0xD7, 0x97, 0x57, 0x17, 0xC7, 0x87, 0x47, 0x07, -0xF3, 0xB3, 0x73, 0x33, 0xE3, 0xA3, 0x63, 0x23, -0xD3, 0x93, 0x53, 0x13, 0xC3, 0x83, 0x43, 0x03, -0xFE, 0xBE, 0x7E, 0x3E, 0xEE, 0xAE, 0x6E, 0x2E, -0xDE, 0x9E, 0x5E, 0x1E, 0xCE, 0x8E, 0x4E, 0x0E, -0xFA, 0xBA, 0x7A, 0x3A, 0xEA, 0xAA, 0x6A, 0x2A, -0xDA, 0x9A, 0x5A, 0x1A, 0xCA, 0x8A, 0x4A, 0x0A, -0xF6, 0xB6, 0x76, 0x36, 0xE6, 0xA6, 0x66, 0x26, -0xD6, 0x96, 0x56, 0x16, 0xC6, 0x86, 0x46, 0x06, -0xF2, 0xB2, 0x72, 0x32, 0xE2, 0xA2, 0x62, 0x22, -0xD2, 0x92, 0x52, 0x12, 0xC2, 0x82, 0x42, 0x02, -0xFD, 0xBD, 0x7D, 0x3D, 0xED, 0xAD, 0x6D, 0x2D, -0xDD, 0x9D, 0x5D, 0x1D, 0xCD, 0x8D, 0x4D, 0x0D, -0xF9, 0xB9, 0x79, 0x39, 0xE9, 0xA9, 0x69, 0x29, -0xD9, 0x99, 0x59, 0x19, 0xC9, 0x89, 0x49, 0x09, -0xF5, 0xB5, 0x75, 0x35, 0xE5, 0xA5, 0x65, 0x25, -0xD5, 0x95, 0x55, 0x15, 0xC5, 0x85, 0x45, 0x05, -0xF1, 0xB1, 0x71, 0x31, 0xE1, 0xA1, 0x61, 0x21, -0xD1, 0x91, 0x51, 0x11, 0xC1, 0x81, 0x41, 0x01, -0xFC, 0xBC, 0x7C, 0x3C, 0xEC, 0xAC, 0x6C, 0x2C, -0xDC, 0x9C, 0x5C, 0x1C, 0xCC, 0x8C, 0x4C, 0x0C, -0xF8, 0xB8, 0x78, 0x38, 0xE8, 0xA8, 0x68, 0x28, -0xD8, 0x98, 0x58, 0x18, 0xC8, 0x88, 0x48, 0x08, -0xF4, 0xB4, 0x74, 0x34, 0xE4, 0xA4, 0x64, 0x24, -0xD4, 0x94, 0x54, 0x14, 0xC4, 0x84, 0x44, 0x04, -0xF0, 0xB0, 0x70, 0x30, 0xE0, 0xA0, 0x60, 0x20, -0xD0, 0x90, 0x50, 0x10, 0xC0, 0x80, 0x40, 0x00 + 0xFF, 0xBF, 0x7F, 0x3F, 0xEF, 0xAF, 0x6F, 0x2F, + 0xDF, 0x9F, 0x5F, 0x1F, 0xCF, 0x8F, 0x4F, 0x0F, + 0xFB, 0xBB, 0x7B, 0x3B, 0xEB, 0xAB, 0x6B, 0x2B, + 0xDB, 0x9B, 0x5B, 0x1B, 0xCB, 0x8B, 0x4B, 0x0B, + 0xF7, 0xB7, 0x77, 0x37, 0xE7, 0xA7, 0x67, 0x27, + 0xD7, 0x97, 0x57, 0x17, 0xC7, 0x87, 0x47, 0x07, + 0xF3, 0xB3, 0x73, 0x33, 0xE3, 0xA3, 0x63, 0x23, + 0xD3, 0x93, 0x53, 0x13, 0xC3, 0x83, 0x43, 0x03, + 0xFE, 0xBE, 0x7E, 0x3E, 0xEE, 0xAE, 0x6E, 0x2E, + 0xDE, 0x9E, 0x5E, 0x1E, 0xCE, 0x8E, 0x4E, 0x0E, + 0xFA, 0xBA, 0x7A, 0x3A, 0xEA, 0xAA, 0x6A, 0x2A, + 0xDA, 0x9A, 0x5A, 0x1A, 0xCA, 0x8A, 0x4A, 0x0A, + 0xF6, 0xB6, 0x76, 0x36, 0xE6, 0xA6, 0x66, 0x26, + 0xD6, 0x96, 0x56, 0x16, 0xC6, 0x86, 0x46, 0x06, + 0xF2, 0xB2, 0x72, 0x32, 0xE2, 0xA2, 0x62, 0x22, + 0xD2, 0x92, 0x52, 0x12, 0xC2, 0x82, 0x42, 0x02, + 0xFD, 0xBD, 0x7D, 0x3D, 0xED, 0xAD, 0x6D, 0x2D, + 0xDD, 0x9D, 0x5D, 0x1D, 0xCD, 0x8D, 0x4D, 0x0D, + 0xF9, 0xB9, 0x79, 0x39, 0xE9, 0xA9, 0x69, 0x29, + 0xD9, 0x99, 0x59, 0x19, 0xC9, 0x89, 0x49, 0x09, + 0xF5, 0xB5, 0x75, 0x35, 0xE5, 0xA5, 0x65, 0x25, + 0xD5, 0x95, 0x55, 0x15, 0xC5, 0x85, 0x45, 0x05, + 0xF1, 0xB1, 0x71, 0x31, 0xE1, 0xA1, 0x61, 0x21, + 0xD1, 0x91, 0x51, 0x11, 0xC1, 0x81, 0x41, 0x01, + 0xFC, 0xBC, 0x7C, 0x3C, 0xEC, 0xAC, 0x6C, 0x2C, + 0xDC, 0x9C, 0x5C, 0x1C, 0xCC, 0x8C, 0x4C, 0x0C, + 0xF8, 0xB8, 0x78, 0x38, 0xE8, 0xA8, 0x68, 0x28, + 0xD8, 0x98, 0x58, 0x18, 0xC8, 0x88, 0x48, 0x08, + 0xF4, 0xB4, 0x74, 0x34, 0xE4, 0xA4, 0x64, 0x24, + 0xD4, 0x94, 0x54, 0x14, 0xC4, 0x84, 0x44, 0x04, + 0xF0, 0xB0, 0x70, 0x30, 0xE0, 0xA0, 0x60, 0x20, + 0xD0, 0x90, 0x50, 0x10, 0xC0, 0x80, 0x40, 0x00 }; @@ -91,8 +91,8 @@ static const uint64_t twin_table[256] = { */ // use: km = Kmer(); -// pre: -// post: the DNA string in km is AA....AAA (k times A) +// pre: +// post: the DNA string in km is AA....AAA (k times A) Kmer::Kmer() { //memset(bytes,0,MAX_K/4); for (size_t i = 0; i < MAX_K/32; i++) { @@ -103,7 +103,7 @@ Kmer::Kmer() { // use: _km = Kmer(km); // pre: s[0],...,s[k] are all equal to 'A','C','G' or 'T' -// post: the DNA string in _km and is the same as in km +// post: the DNA string in _km and is the same as in km Kmer::Kmer(const Kmer& o) { //memcpy(bytes,o.bytes,MAX_K/4); for (size_t i = 0; i < MAX_K/32; i++) { @@ -121,8 +121,8 @@ Kmer::Kmer(const char *s) { // use: _km = km; -// pre: -// post: the DNA string in _km and is the same as in km +// pre: +// post: the DNA string in _km and is the same as in km Kmer& Kmer::operator=(const Kmer& o) { if (this != &o) { for (size_t i = 0; i < MAX_K/32; i++) { @@ -135,7 +135,7 @@ Kmer& Kmer::operator=(const Kmer& o) { // use: km = Kmer(); -// pre: +// pre: // post: The last bit in the bit array which stores the DNA string has been set to 1 // which indicates that the km is invalid void Kmer::set_deleted() { @@ -144,17 +144,19 @@ void Kmer::set_deleted() { // use: b = (km1 < km2); -// pre: +// pre: // post: b is true <==> the DNA strings in km1 is alphabetically smaller than -// the DNA string in km2 +// the DNA string in km2 bool Kmer::operator<(const Kmer& o) const { bool r = false; for (size_t i = 0; i < MAX_K/32; ++i) { - if (longs[i] < o.longs[i]) + if (longs[i] < o.longs[i]) { return true; - if (longs[i] > o.longs[i]) + } + if (longs[i] > o.longs[i]) { return false; + } } return false; @@ -176,7 +178,7 @@ bool Kmer::operator<(const Kmer& o) const { // use: b = (km1 == km2); -// pre: +// pre: // post: b is true <==> the DNA strings in km1 and km2 are equal bool Kmer::operator==(const Kmer& o) const { for (size_t i = 0; i < MAX_K/32; i++) { @@ -191,16 +193,16 @@ bool Kmer::operator==(const Kmer& o) const { // use: km.set_kmer(s); // pre: s[0],...,s[k-1] are all 'A','C','G' or 'T' -// post: The DNA string in km is now equal to s +// post: The DNA string in km is now equal to s void Kmer::set_kmer(const char *s) { size_t i,j,l; memset(bytes,0,MAX_K/4); - + for (i = 0; i < k; ++i) { j = i % 32; l = i/32; assert(*s != '\0'); - + size_t x = ((*s) & 4) >> 1; longs[l] |= ((x + ((x ^ (*s & 2)) >>1)) << (2*(31-j))); /* @@ -210,24 +212,24 @@ void Kmer::set_kmer(const char *s) { case 'G': longs[l] |= (0x02 << (2*j)); break; case 'T': longs[l] |= (0x03 << (2*j)); break; }*/ - - s++; + + s++; } } // use: i = km.hash(); -// pre: -// post: i is the hash value of km +// pre: +// post: i is the hash value of km uint64_t Kmer::hash() const { uint64_t ret; - MurmurHash3_x64_64((const void*)bytes,k_bytes,0,&ret); + MurmurHash3_x64_64((const void *)bytes,k_bytes,0,&ret); return ret; } // use: rep = km.rep(); -// pre: +// pre: // post: rep is km.twin() if the DNA string in km.twin() is alphabetically smaller than // the DNA string in km, else rep is km Kmer Kmer::rep() const { @@ -237,7 +239,7 @@ Kmer Kmer::rep() const { // use: tw = km.twin(); -// pre: +// pre: // post: tw is the twin kmer with respect to km, // i.e. if the DNA string in km is 'GTCA' // then the DNA string in tw is 'TGAC' @@ -245,22 +247,22 @@ Kmer Kmer::twin() const { Kmer km(*this); size_t nlongs = (k+31)/32; - + /*cout << "debugging twin for" << endl; cout << toString() << endl; cout << getBinary() << endl; cout << "nlongs " << nlongs << endl; cout << "flipping bits" << endl;*/ - + for (size_t i = 0; i < nlongs; i++) { uint64_t v = longs[i]; - km.longs[nlongs-1-i] = - (twin_table[v & 0xFF] << 56) | - (twin_table[(v>>8) & 0xFF] << 48) | - (twin_table[(v>>16) & 0xFF] << 40) | + km.longs[nlongs-1-i] = + (twin_table[v & 0xFF] << 56) | + (twin_table[(v>>8) & 0xFF] << 48) | + (twin_table[(v>>16) & 0xFF] << 40) | (twin_table[(v>>24) & 0xFF] << 32) | - (twin_table[(v>>32) & 0xFF] << 24) | - (twin_table[(v>>40) & 0xFF] << 16) | + (twin_table[(v>>32) & 0xFF] << 24) | + (twin_table[(v>>40) & 0xFF] << 16) | (twin_table[(v>>48) & 0xFF] << 8) | (twin_table[(v>>56)]); } @@ -274,17 +276,17 @@ Kmer Kmer::twin() const { //cout << "shift: " << shift << endl; //cout << "shiftmask" << endl << bitset<64>(shiftmask) << endl; - + km.longs[0] = km.longs[0] << shift; //cout << km.getBinary() << endl; for (size_t i = 1; i < nlongs; i++) { //cout << "forloop " << i << endl; km.longs[i-1] |= (km.longs[i] & shiftmask) >> (64-shift); //cout << km.getBinary() << endl; - km.longs[i] = km.longs[i] << shift; + km.longs[i] = km.longs[i] << shift; //cout << km.getBinary() << endl; } - + /* for (size_t i = (k+31)/32; i < nlongs; i++) { km.longs[i] = 0; @@ -301,7 +303,7 @@ Kmer Kmer::twin() const { uint64_t v = ~longs[i]; // flip bits // swap 2 bits v = ((v >> 2) & 0x3333333333333333ULL) | ((v & 0x3333333333333333ULL) << 2); - // swap nibbles ... + // swap nibbles ... v = ((v >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((v & 0x0F0F0F0F0F0F0F0FULL) << 4); // swap bytes v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8); @@ -317,7 +319,7 @@ Kmer Kmer::twin() const { for (size_t i = 0; i < k_bytes; i++) { km.bytes[i] = ~bytes[i]; } - + km.bytes[k_bytes-1] ^= ~k_modmask; km.shiftForward(8*k_bytes-2*k); uint8_t tmp; @@ -331,7 +333,7 @@ Kmer Kmer::twin() const { if ((k_bytes %2) == 1) { km.bytes[k_bytes/2] = base_swap[km.bytes[k_bytes/2]]; } - + return km; */ } @@ -346,18 +348,18 @@ Kmer Kmer::getLink(const size_t index) const { char c; switch (index % 4) { - case 0: c = 'A'; break; - case 1: c = 'C'; break; - case 2: c = 'G'; break; - case 3: c = 'T'; break; + case 0: c = 'A'; break; + case 1: c = 'C'; break; + case 2: c = 'G'; break; + case 3: c = 'T'; break; } - + return (index < 4) ? forwardBase(c) : backwardBase(c); } // use: fw = km.forwardBase(c) -// pre: +// pre: // post: fw is the forward kmer from km with last character c, // i.e. if the DNA string in km is 'ACGT' and c equals 'T' then // the DNA string in fw is 'CGTT' @@ -374,30 +376,30 @@ Kmer Kmer::forwardBase(const char b) const { km.longs[nlongs-1] |= (x + ((x ^ (b & 2)) >>1 )) << (2*(31-((k-1)%32))); return km; -/******** - km.shiftBackward(2); - km.bytes[k_bytes-1] &= Kmer::k_modmask; - - switch(b) { - case 'A': km.bytes[k_bytes-1] |= 0x00 << s; break; - case 'C': km.bytes[k_bytes-1] |= 0x01 << s; break; - case 'G': km.bytes[k_bytes-1] |= 0x02 << s; break; - case 'T': km.bytes[k_bytes-1] |= 0x03 << s; break; - } + /******** + km.shiftBackward(2); + km.bytes[k_bytes-1] &= Kmer::k_modmask; + + switch(b) { + case 'A': km.bytes[k_bytes-1] |= 0x00 << s; break; + case 'C': km.bytes[k_bytes-1] |= 0x01 << s; break; + case 'G': km.bytes[k_bytes-1] |= 0x02 << s; break; + case 'T': km.bytes[k_bytes-1] |= 0x03 << s; break; + } - return km; -*/ + return km; + */ } // use: bw = km.backwardBase(c) -// pre: +// pre: // post: bw is the backward kmer from km with first character c, // i.e. if the DNA string in km is 'ACGT' and c equals 'T' then // the DNA string in bw is 'TACG' Kmer Kmer::backwardBase(const char b) const { Kmer km(*this); - + size_t nlongs = (k+31)/32; km.longs[nlongs-1] = km.longs[nlongs-1] >>2; km.longs[nlongs-1] &= (k%32) ? (((1ULL << (2*(k%32)))-1) << 2*(32-(k%32))) : ~0ULL; @@ -432,16 +434,16 @@ Kmer Kmer::backwardBase(const char b) const { // use: km.printBinary(); -// pre: -// post: The bits in the binary representation of the +// pre: +// post: The bits in the binary representation of the // DNA string for km has been printed to stdout std::string Kmer::getBinary() const { - + size_t nlongs = MAX_K/32; std::string r; r.reserve(64*nlongs); for (size_t i = 0; i < nlongs; i++) { - r.append(std::bitset<64>(longs[i]).to_string,std::allocator >()); + r.append(std::bitset<64>(longs[i]).to_string,std::allocator>()); } return r; /* @@ -449,7 +451,7 @@ std::string Kmer::getBinary() const { int2bin(bytes[i],buff,8); printf("%s",buff); } - + printf("\n"); */ } @@ -458,18 +460,18 @@ std::string Kmer::getBinary() const { // use: km.toString(s); // pre: s has space for k+1 elements // post: s[0,...,k-1] is the DNA string for the Kmer km and s[k] = '\0' -void Kmer::toString(char * s) const { +void Kmer::toString(char *s) const { size_t i,j,l; - + for (i = 0; i < k; i++) { j = i % 32; l = i / 32; switch(((longs[l]) >> (2*(31-j)) )& 0x03 ) { - case 0x00: *s = 'A'; ++s; break; - case 0x01: *s = 'C'; ++s; break; - case 0x02: *s = 'G'; ++s; break; - case 0x03: *s = 'T'; ++s; break; + case 0x00: *s = 'A'; ++s; break; + case 0x01: *s = 'C'; ++s; break; + case 0x02: *s = 'G'; ++s; break; + case 0x03: *s = 'T'; ++s; break; } } @@ -490,8 +492,8 @@ std::string Kmer::toString() const { // if i=2 then ACGT becomes XACG and X is A,C,G or T /* void Kmer::shiftForward(int shift) { - - size_t shiftmask = + + size_t shiftmask = if (shift>0) { @@ -510,7 +512,7 @@ void Kmer::shiftForward(int shift) { */ // use: km.shiftBackward(i); -// pre: i = 2,4,6 +// pre: i = 2,4,6 // post: The DNA string in km has been shifted i/2 positions backward i.e. // if i=2 then ACGT becomes CGTX and X is A,C,G or T /* @@ -521,7 +523,7 @@ void Kmer::shiftBackward(int shift) { bytes[i] >>= shift; bytes[i] |= (uint8_t) ( bytes[i+1] << (8-shift)); } - + bytes[Kmer::k_bytes-1] >>= shift; } else { assert(0); // bad diff --git a/src/Kmer.hpp b/src/Kmer.hpp index 77820074..729793dd 100644 --- a/src/Kmer.hpp +++ b/src/Kmer.hpp @@ -2,7 +2,7 @@ #define BFG_KMER_HPP #ifndef MAX_KMER_SIZE - #define MAX_KMER_SIZE 32 +#define MAX_KMER_SIZE 32 #endif #include @@ -16,8 +16,8 @@ -/* Short description: - * - Store kmer strings by using 2 bits per base instead of 8 +/* Short description: + * - Store kmer strings by using 2 bits per base instead of 8 * - Easily return reverse complements of kmers, e.g. TTGG -> CCAA * - Easily compare kmers * - Provide hash of kmers @@ -30,10 +30,10 @@ class Kmer { Kmer(const Kmer& o); explicit Kmer(const char *s); - + Kmer& operator=(const Kmer& o); - + void set_deleted(); bool operator<(const Kmer& o) const; @@ -48,7 +48,7 @@ class Kmer { uint64_t hash() const; - + Kmer twin() const; Kmer rep() const; @@ -58,10 +58,10 @@ class Kmer { Kmer forwardBase(const char b) const; Kmer backwardBase(const char b) const; - + std::string getBinary() const; - - void toString(char * s) const; + + void toString(char *s) const; std::string toString() const; // static functions @@ -87,14 +87,14 @@ class Kmer { // private functions //void shiftForward(int shift); - + //void shiftBackward(int shift); }; struct KmerHash { - size_t operator()(const Kmer &km) const { + size_t operator()(const Kmer& km) const { return km.hash(); } }; diff --git a/src/KmerHashTable.h b/src/KmerHashTable.h index 4a1642c9..81e0ad6a 100644 --- a/src/KmerHashTable.h +++ b/src/KmerHashTable.h @@ -11,229 +11,229 @@ template struct KmerHashTable { - using value_type = std::pair; - using key_type = Kmer; - using mapped_type = T; + using value_type = std::pair; + using key_type = Kmer; + using mapped_type = T; - Hash hasher; - value_type* table; - size_t size_, pop; - value_type empty; + Hash hasher; + value_type *table; + size_t size_, pop; + value_type empty; // ---- iterator ---- - template - class iterator_ : public std::iterator { - public: - - typedef typename std::conditional::type DataStructurePointerType; - typedef typename std::conditional::type ValueReferenceType; - typedef typename std::conditional::type ValuePointerType; - - - DataStructurePointerType ht; - size_t h; - - iterator_(DataStructurePointerType ht_) : ht(ht_), h(ht_->size_) {} - iterator_(DataStructurePointerType ht_, size_t h_) : ht(ht_), h(h_) {} - - iterator_(const iterator_& o) : ht(o.ht), h(o.h) {} - iterator_& operator=(const iterator_& o) {ht=o.ht; h=o.h;} - - ValueReferenceType operator*() const {return ht->table[h];} - ValuePointerType operator->() const {return &(ht->table[h]);} - - void find_first() { - h = 0; - if (ht->table != nullptr && ht->size_>0) { - if (ht->table[h].first == ht->empty.first) { - operator++(); - } - } - } - - iterator_& operator++() { - if (h == ht->size_) { - return *this; - } - ++h; - for (; h < ht->size_; ++h) { - if (ht->table[h].first != ht->empty.first) { - break; - } - } - return *this; - } - bool operator==(const iterator_ &o) const {return (ht->table == o.ht->table) && (h == o.h);} - bool operator!=(const iterator_ &o) const {return !(this->operator==(o));} - friend class iterator_; - }; - - typedef iterator_ const_iterator; - typedef iterator_ iterator; - - - // --- hash table - - + template + class iterator_ : public std::iterator { + public: + + typedef typename std::conditional::type DataStructurePointerType; + typedef typename std::conditional::type ValueReferenceType; + typedef typename std::conditional::type ValuePointerType; + + + DataStructurePointerType ht; + size_t h; + + iterator_(DataStructurePointerType ht_) : ht(ht_), h(ht_->size_) {} + iterator_(DataStructurePointerType ht_, size_t h_) : ht(ht_), h(h_) {} + + iterator_(const iterator_& o) : ht(o.ht), h(o.h) {} + iterator_& operator=(const iterator_& o) {ht=o.ht; h=o.h;} + + ValueReferenceType operator*() const {return ht->table[h];} + ValuePointerType operator->() const {return &(ht->table[h]);} + + void find_first() { + h = 0; + if (ht->table != nullptr && ht->size_>0) { + if (ht->table[h].first == ht->empty.first) { + operator++(); + } + } + } + + iterator_& operator++() { + if (h == ht->size_) { + return *this; + } + ++h; + for (; h < ht->size_; ++h) { + if (ht->table[h].first != ht->empty.first) { + break; + } + } + return *this; + } + bool operator==(const iterator_ &o) const {return (ht->table == o.ht->table) && (h == o.h);} + bool operator!=(const iterator_ &o) const {return !(this->operator==(o));} + friend class iterator_; + }; + + typedef iterator_ const_iterator; + typedef iterator_ iterator; + + + // --- hash table + + KmerHashTable(const Hash& h = Hash() ) : hasher(h), table(nullptr), size_(0), pop(0) { - empty.first.set_deleted(); - init_table(1024); - } + empty.first.set_deleted(); + init_table(1024); + } KmerHashTable(size_t sz, const Hash& h = Hash() ) : hasher(h), table(nullptr), size_(0), pop(0) { - empty.first.set_deleted(); - init_table((size_t) (1.2*sz)); - } - - ~KmerHashTable() { - clear_table(); - } - - void clear_table() { - if (table != nullptr) { - delete[] table; - table = nullptr; - } - size_ = 0; - pop = 0; - } - - size_t size() const { - return pop; - } - - void clear() { - std::fill(table, table+size_, empty); - pop = 0; - } - - void init_table(size_t sz) { - clear_table(); - size_ = rndup(sz); - //cerr << "init table of size " << size_ << endl; - table = new value_type[size_]; - std::fill(table, table+size_, empty); - } - - iterator find(const Kmer& key) { - size_t h = hasher(key) & (size_-1); - - for (;; h = (h+1!=size_ ? h+1 : 0)) { - if (table[h].first == empty.first) { - // empty slot, insert here - return iterator(this); - } else if (table[h].first == key) { - // same key, found - return iterator(this, h); - } - } - } - - const_iterator find(const Kmer& key) const { - - size_t h = hasher(key) & (size_-1); - - for (;; h = (h+1!=size_ ? h+1 : 0)) { - if (table[h].first == empty.first) { - // empty slot, insert here - return const_iterator(this); - } else if (table[h].first == key) { - // same key, found - return const_iterator(this, h); - } - } - } - - - std::pair insert(const value_type &val) { - //cerr << "inserting " << val.first.toString() << " = " << val.second << endl; - if ((pop + (pop>>4))> size_) { // if more than 80% full - //cerr << "-- triggered resize--" << endl; - reserve(2*size_); - } - - size_t h = hasher(val.first) & (size_-1); - //cerr << " hash value = " << h << endl; - for (;; h = (h+1!=size_ ? h+1 : 0)) { - //cerr << " lookup at " << h << endl; - if (table[h].first == empty.first) { - //cerr << " found empty slot" << endl; - // empty slot, insert here - table[h] = val; - ++pop; // new table - return {iterator(this, h), true}; - } else if (table[h].first == val.first) { - // same key, update value - //cerr << " found key already here " << table[h].first.toString() << " = " << table[h].second << endl; - return {iterator(this, h), false}; - } - } - - } - - void reserve(size_t sz) { - - if (sz <= size_) { - return; - } - - value_type* old_table = table; - size_t old_size_ = size_; - - - size_ = rndup(sz); - pop = 0; - - table = new value_type[size_]; - std::fill(table, table+size_, empty); - for (size_t i = 0; i < old_size_; i++) { - if (old_table[i].first != empty.first) { - insert(old_table[i]); - } - } - delete[] old_table; - old_table = nullptr; - - } - - size_t rndup(size_t v) { - v--; - v |= v >> 1; - v |= v >> 2; - v |= v >> 4; - v |= v >> 8; - v |= v >> 16; - v |= v >> 32; - v++; - return v; - } - - iterator begin() { - iterator it(this); - it.find_first(); - return it; - } - - const_iterator begin() const { - const_iterator it(this); - it.find_first(); - return it; - } - - iterator end() { - return iterator(this); - } - - const_iterator end() const { - return const_iterator(this); - } - - - - - + empty.first.set_deleted(); + init_table((size_t) (1.2*sz)); + } + + ~KmerHashTable() { + clear_table(); + } + + void clear_table() { + if (table != nullptr) { + delete[] table; + table = nullptr; + } + size_ = 0; + pop = 0; + } + + size_t size() const { + return pop; + } + + void clear() { + std::fill(table, table+size_, empty); + pop = 0; + } + + void init_table(size_t sz) { + clear_table(); + size_ = rndup(sz); + //cerr << "init table of size " << size_ << endl; + table = new value_type[size_]; + std::fill(table, table+size_, empty); + } + + iterator find(const Kmer& key) { + size_t h = hasher(key) & (size_-1); + + for (;; h = (h+1!=size_ ? h+1 : 0)) { + if (table[h].first == empty.first) { + // empty slot, insert here + return iterator(this); + } else if (table[h].first == key) { + // same key, found + return iterator(this, h); + } + } + } + + const_iterator find(const Kmer& key) const { + + size_t h = hasher(key) & (size_-1); + + for (;; h = (h+1!=size_ ? h+1 : 0)) { + if (table[h].first == empty.first) { + // empty slot, insert here + return const_iterator(this); + } else if (table[h].first == key) { + // same key, found + return const_iterator(this, h); + } + } + } + + + std::pair insert(const value_type& val) { + //cerr << "inserting " << val.first.toString() << " = " << val.second << endl; + if ((pop + (pop>>4))> size_) { // if more than 80% full + //cerr << "-- triggered resize--" << endl; + reserve(2*size_); + } + + size_t h = hasher(val.first) & (size_-1); + //cerr << " hash value = " << h << endl; + for (;; h = (h+1!=size_ ? h+1 : 0)) { + //cerr << " lookup at " << h << endl; + if (table[h].first == empty.first) { + //cerr << " found empty slot" << endl; + // empty slot, insert here + table[h] = val; + ++pop; // new table + return {iterator(this, h), true}; + } else if (table[h].first == val.first) { + // same key, update value + //cerr << " found key already here " << table[h].first.toString() << " = " << table[h].second << endl; + return {iterator(this, h), false}; + } + } + + } + + void reserve(size_t sz) { + + if (sz <= size_) { + return; + } + + value_type *old_table = table; + size_t old_size_ = size_; + + + size_ = rndup(sz); + pop = 0; + + table = new value_type[size_]; + std::fill(table, table+size_, empty); + for (size_t i = 0; i < old_size_; i++) { + if (old_table[i].first != empty.first) { + insert(old_table[i]); + } + } + delete[] old_table; + old_table = nullptr; + + } + + size_t rndup(size_t v) { + v--; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v |= v >> 32; + v++; + return v; + } + + iterator begin() { + iterator it(this); + it.find_first(); + return it; + } + + const_iterator begin() const { + const_iterator it(this); + it.find_first(); + return it; + } + + iterator end() { + return iterator(this); + } + + const_iterator end() const { + return const_iterator(this); + } + + + + + }; #endif // KALLISTO_KMERHASHTABLE_H diff --git a/src/KmerIndex.h b/src/KmerIndex.h index 250305aa..8752532e 100644 --- a/src/KmerIndex.h +++ b/src/KmerIndex.h @@ -28,407 +28,403 @@ KSEQ_INIT(gzFile, gzread) using EcMap = std::unordered_map>; struct SortedVectorHasher { - size_t operator()(const std::vector &v) const { - uint64_t r = 0; - int i=0; - for (auto x : v) { - uint64_t t; - MurmurHash3_x64_64(&x,sizeof(x), 0,&t); - t = (x>>i) | (x<<(64-i)); - r = r ^ t; - i = (i+1)%64; - } - return r; - } + size_t operator()(const std::vector& v) const { + uint64_t r = 0; + int i=0; + for (auto x : v) { + uint64_t t; + MurmurHash3_x64_64(&x,sizeof(x), 0,&t); + t = (x>>i) | (x<<(64-i)); + r = r ^ t; + i = (i+1)%64; + } + return r; + } }; -struct KmerIndex -{ - KmerIndex(const ProgramOptions& opt) : k(opt.k), num_trans(0), skip(opt.skip) { - //LoadTranscripts(opt.transfasta); - } - - ~KmerIndex() {} - - - // use: match(s,l,v) - // pre: v is initialized - // post: v contains all equiv classes for the k-mers in s - void match(const char *s, int l, std::vector & v) const { - KmerIterator kit(s), kit_end; - for (int i = 0;kit != kit_end; ++kit,++i) { - if (i==skip) { - i=0; - } - if (i==0) { - Kmer rep = kit->first.rep(); - auto search = kmap.find(rep); - if (search != kmap.end()) { - // if k-mer found - v.push_back(search->second); // add equivalence class - } - } - } - } - - // use: res = intersect(ec,v) - // pre: ec is in ecmap, v is a vector of valid transcripts - // v is sorted in increasing order - // post: res contains the intersection of ecmap[ec] and v sorted increasing - // res is empty if ec is not in ecmap - std::vector intersect(int ec, const std::vector& v) const { - std::vector res; - auto search = ecmap.find(ec); - if (search != ecmap.end()) { - auto &u = search->second; - res.reserve(v.size()); - - auto a = u.begin(); - auto b = v.begin(); - - while (a != u.end() && b != v.end()) { - if (*a < *b) { - ++a; - } else if (*b < *a) { - ++b; - } else { - // match - res.push_back(*a); - ++a; - ++b; - } - } - } - return res; - } - - - void BuildTranscripts(const std::string& fasta) { - // TODO: add code to check if binary file exists and load it directly - // FIXME: check if FASTA file actually exists - // If it doesn't, will just hang - int l; - std::cerr << "Loading fasta file " << fasta - << std::endl; - std::cerr << "k: " << k << std::endl; - gzFile fp = gzopen(fasta.c_str(),"r"); - kseq_t *seq = kseq_init(fp); - - int transid = 0; - std::unordered_map kmcount; // temporary - - // maps kmers to set of transcript ids that contain them - std::unordered_map, KmerHash> all_kmap; - - // for each transcript in fasta file - while ((l = kseq_read(seq)) > 0) { - bool added = false; - - target_names_.push_back(seq->name.s); - - // if it is long enough - if (seq->seq.l >= k) { - KmerIterator kit(seq->seq.s), kit_end; - // for each k-mer add to map - for(;kit != kit_end; ++kit) { - Kmer rep = kit->first.rep(); - kmcount[rep]++; - auto search = all_kmap.find(rep); - if (search == all_kmap.end()) { - // new k-mer - all_kmap.insert({rep, {transid}}); - } else { - // seen before - std::vector &v = search->second; - if (*v.rbegin() < transid) { - // but new transcript - v.push_back(transid); - } - } - added = true; - } - } - if (added) { - trans_lens_.push_back(seq->seq.l); - transid++; - if (transid % 1000 == 1) { - std::cerr << " " << transid << " size of k-mer map " << all_kmap.size() << std::endl; - } - } - } - - num_trans = transid; - std::cerr << "Found " << num_trans << " transcripts" - << std::endl - << "Size of k-mer map " << all_kmap.size() << std::endl; - - - // for each transcript - for (int i = 0; i < num_trans; i++ ) { - // create its own eqs - std::vector single(1,i); - ecmap.insert({i,single}); - ecmapinv.insert({single,i}); - } - - - int eqs_id = num_trans; - - - for (auto& kv : all_kmap) { - auto search = ecmapinv.find(kv.second); - // if we have seen this equivalence class - if (search != ecmapinv.end()) { - // update kmap - kmap.insert({kv.first, search->second}); - } else { - // else create a new equivalence class and update kmap - ecmapinv.insert({kv.second,eqs_id}); - ecmap.insert({eqs_id, kv.second}); - kmap.insert({kv.first, eqs_id}); - eqs_id++; - } - } - - std::cerr << "Created " << ecmap.size() << " equivalence classes from " << num_trans << " transcripts" << std::endl; - - /* std::cout << "EqId\tTransIdList\n"; */ - /* for (auto &ekv : ecmap) { */ - /* std::cout << ekv.first; */ - /* for (auto el : ekv.second) { */ - /* std::cout << "\t" << el; */ - /* } */ - /* std::cout << "\n"; */ - /* } */ - /* std::cout.flush(); */ - - - std::cerr << "K-mer map has " << kmap.size() << " k-mers and " << std::endl; - kseq_destroy(seq); - gzclose(fp); - } - - void write(const std::string& index_out, bool writeKmerTable = true) - { - std::ofstream out; - out.open(index_out, std::ios::out | std::ios::binary); - - if (!out.is_open()) { - // TODO: better handling - std::cerr << "Error: index output file could not be opened!"; - exit(1); +struct KmerIndex { + KmerIndex(const ProgramOptions& opt) : k(opt.k), num_trans(0), skip(opt.skip) { + //LoadTranscripts(opt.transfasta); + } + + ~KmerIndex() {} + + + // use: match(s,l,v) + // pre: v is initialized + // post: v contains all equiv classes for the k-mers in s + void match(const char *s, int l, std::vector& v) const { + KmerIterator kit(s), kit_end; + for (int i = 0; kit != kit_end; ++kit,++i) { + if (i==skip) { + i=0; + } + if (i==0) { + Kmer rep = kit->first.rep(); + auto search = kmap.find(rep); + if (search != kmap.end()) { + // if k-mer found + v.push_back(search->second); // add equivalence class } - - // 1. write index - out.write((char*)&INDEX_VERSION, sizeof(INDEX_VERSION)); - - // 2. write k - out.write((char*)&k, sizeof(k)); - - // 3. write number of transcripts - out.write((char*)&num_trans, sizeof(num_trans)); - - // 4. write out transcript lengths - for (int tlen : trans_lens_) { - out.write((char*)&tlen, sizeof(tlen)); - } - - size_t kmap_size = kmap.size(); - - if (writeKmerTable) { - // 5. write number of k-mers in map - out.write((char*)&kmap_size, sizeof(kmap_size)); - - // 6. write kmer->ec values - for (auto& kv : kmap) { - out.write((char*)&kv.first, sizeof(kv.first)); - out.write((char*)&kv.second, sizeof(kv.second)); - } - } else { - // 5. write fake k-mer size - kmap_size = 0; - out.write((char*)&kmap_size, sizeof(kmap_size)); - - // 6. write none of the kmer->ec values - } - // 7. write number of equivalence classes - size_t tmp_size; - tmp_size = ecmap.size(); - out.write((char*)&tmp_size, sizeof(tmp_size)); - - // 8. write out each equiv class - for (auto& kv : ecmap) { - out.write((char*)&kv.first, sizeof(kv.first)); - - // 8.1 write out the size of equiv class - tmp_size = kv.second.size(); - out.write((char*)&tmp_size, sizeof(tmp_size)); - // 8.2 write each member - for (auto& val: kv.second) { - out.write((char*)&val, sizeof(val)); + } + } + } + + // use: res = intersect(ec,v) + // pre: ec is in ecmap, v is a vector of valid transcripts + // v is sorted in increasing order + // post: res contains the intersection of ecmap[ec] and v sorted increasing + // res is empty if ec is not in ecmap + std::vector intersect(int ec, const std::vector& v) const { + std::vector res; + auto search = ecmap.find(ec); + if (search != ecmap.end()) { + auto& u = search->second; + res.reserve(v.size()); + + auto a = u.begin(); + auto b = v.begin(); + + while (a != u.end() && b != v.end()) { + if (*a < *b) { + ++a; + } else if (*b < *a) { + ++b; + } else { + // match + res.push_back(*a); + ++a; + ++b; + } + } + } + return res; + } + + + void BuildTranscripts(const std::string& fasta) { + // TODO: add code to check if binary file exists and load it directly + // FIXME: check if FASTA file actually exists + // If it doesn't, will just hang + int l; + std::cerr << "Loading fasta file " << fasta + << std::endl; + std::cerr << "k: " << k << std::endl; + gzFile fp = gzopen(fasta.c_str(),"r"); + kseq_t *seq = kseq_init(fp); + + int transid = 0; + std::unordered_map kmcount; // temporary + + // maps kmers to set of transcript ids that contain them + std::unordered_map, KmerHash> all_kmap; + + // for each transcript in fasta file + while ((l = kseq_read(seq)) > 0) { + bool added = false; + + target_names_.push_back(seq->name.s); + + // if it is long enough + if (seq->seq.l >= k) { + KmerIterator kit(seq->seq.s), kit_end; + // for each k-mer add to map + for(; kit != kit_end; ++kit) { + Kmer rep = kit->first.rep(); + kmcount[rep]++; + auto search = all_kmap.find(rep); + if (search == all_kmap.end()) { + // new k-mer + all_kmap.insert({rep, {transid}}); + } else { + // seen before + std::vector& v = search->second; + if (*v.rbegin() < transid) { + // but new transcript + v.push_back(transid); } + } + added = true; } - - // 9. Write out target ids - // XXX: num_trans should equal to target_names_.size(), so don't need - // to write out again. - assert(num_trans == target_names_.size()); - for (auto& tid : target_names_) { - // 9.1 write out how many bytes - // XXX: Note: this doesn't actually encore the max targ id size. - // might cause problems in the future - tmp_size = tid.size(); - out.write((char*)&tmp_size, sizeof(tmp_size)); - - // 9.2 write out the actual string - out.write(tid.c_str(), tid.size()); + } + if (added) { + trans_lens_.push_back(seq->seq.l); + transid++; + if (transid % 1000 == 1) { + std::cerr << " " << transid << " size of k-mer map " << all_kmap.size() << std::endl; } + } + } + + num_trans = transid; + std::cerr << "Found " << num_trans << " transcripts" + << std::endl + << "Size of k-mer map " << all_kmap.size() << std::endl; - out.flush(); - out.close(); + + // for each transcript + for (int i = 0; i < num_trans; i++ ) { + // create its own eqs + std::vector single(1,i); + ecmap.insert({i,single}); + ecmapinv.insert({single,i}); } - // note opt is not const - void load(ProgramOptions &opt, bool loadKmerTable = true) { - - std::string& index_in = opt.index; - std::ifstream in; - - in.open(index_in, std::ios::in | std::ios::binary); - - if (!in.is_open()) { - // TODO: better handling - std::cerr << "Error: index input file could not be opened!"; - exit(1); - } - - // 1. read version - size_t header_version = 0; - in.read((char*)&header_version, sizeof(header_version)); - - if (header_version != INDEX_VERSION) { - std::cerr << "Error: Incompatiple indices. Found version " << header_version << ", expected version " << INDEX_VERSION << std::endl - << "Rerun with index to regenerate!"; - exit(1); - } - - // 2. read k - in.read((char*)&k, sizeof(k)); - if (Kmer::k == 0) { - //std::cerr << "[index] no k has been set, setting k = " << k << std::endl; - Kmer::set_k(k); - opt.k = k; - } else if (Kmer::k == k) { - //std::cerr << "[index] Kmer::k has been set and matches" << k << std::endl; - opt.k = k; - } else { - std::cerr << "Error: Kmer::k was already set to = " << Kmer::k << std::endl - << " conflicts with value of k = " << k << std::endl; - exit(1); - } - - // 3. read number of transcripts - in.read((char*)&num_trans, sizeof(num_trans)); - - // 4. read number of transcripts - trans_lens_.clear(); - trans_lens_.reserve(num_trans); - - for (int i = 0; i < num_trans; i++) { - int tlen; - in.read((char*)&tlen, sizeof(tlen)); - trans_lens_.push_back(tlen); - } - - // 5. read number of k-mers - size_t kmap_size; - in.read((char*)&kmap_size, sizeof(kmap_size)); - - std::cerr << "[index] k: " << k << std::endl; - std::cerr << "[index] num_trans read: " << num_trans << std::endl; - std::cerr << "[index] kmap size: " << kmap_size << std::endl; - - kmap.clear(); - if (loadKmerTable) { - kmap.reserve(kmap_size); - } - - // 6. read kmer->ec values - Kmer tmp_kmer; - int tmp_val; - for (size_t i = 0; i < kmap_size; ++i) - { - in.read((char*)&tmp_kmer, sizeof(tmp_kmer)); - in.read((char*)&tmp_val, sizeof(tmp_val)); - - if (loadKmerTable) { - kmap.insert({tmp_kmer, tmp_val}); - } - } - - // 7. read number of equivalence classes - size_t ecmap_size; - in.read((char*)&ecmap_size, sizeof(ecmap_size)); - - std::cerr << "[index] ecmap size: " << ecmap_size << std::endl; - - int tmp_id; - size_t vec_size; - // 8. read each equiv class - for (size_t i = 0; i < ecmap_size; ++i) { - in.read((char*)&tmp_id, sizeof(tmp_id)); - - // 8.1 read size of equiv class - in.read((char*)&vec_size, sizeof(vec_size)); - - // 8.2 read each member - std::vector tmp_vec; - tmp_vec.reserve(vec_size); - for (size_t j = 0; j < vec_size; ++j ) - { - in.read((char*)&tmp_val, sizeof(tmp_val)); - tmp_vec.push_back(tmp_val); - } - ecmap.insert({tmp_id, tmp_vec}); - ecmapinv.insert({tmp_vec, tmp_id}); - } - - // 9. read in target ids - target_names_.clear(); - target_names_.reserve(num_trans); - - size_t tmp_size; - char buffer[1024]; // if your target_name is longer than this, screw you. - for (auto i = 0; i < num_trans; ++i) { - // 9.1 read in the size - in.read((char*)&tmp_size, sizeof(tmp_size)); - - // 9.2 read in the character string - in.read(buffer, tmp_size); - - std::string tmp_targ_id( buffer ); - target_names_.push_back(std::string( buffer )); - - // clear the buffer for next string - memset(buffer,0,strlen(buffer)); - } - in.close(); - } + int eqs_id = num_trans; + + + for (auto& kv : all_kmap) { + auto search = ecmapinv.find(kv.second); + // if we have seen this equivalence class + if (search != ecmapinv.end()) { + // update kmap + kmap.insert({kv.first, search->second}); + } else { + // else create a new equivalence class and update kmap + ecmapinv.insert({kv.second,eqs_id}); + ecmap.insert({eqs_id, kv.second}); + kmap.insert({kv.first, eqs_id}); + eqs_id++; + } + } + + std::cerr << "Created " << ecmap.size() << " equivalence classes from " << num_trans << " transcripts" << std::endl; + + /* std::cout << "EqId\tTransIdList\n"; */ + /* for (auto &ekv : ecmap) { */ + /* std::cout << ekv.first; */ + /* for (auto el : ekv.second) { */ + /* std::cout << "\t" << el; */ + /* } */ + /* std::cout << "\n"; */ + /* } */ + /* std::cout.flush(); */ + + + std::cerr << "K-mer map has " << kmap.size() << " k-mers and " << std::endl; + kseq_destroy(seq); + gzclose(fp); + } + + void write(const std::string& index_out, bool writeKmerTable = true) { + std::ofstream out; + out.open(index_out, std::ios::out | std::ios::binary); + + if (!out.is_open()) { + // TODO: better handling + std::cerr << "Error: index output file could not be opened!"; + exit(1); + } + + // 1. write index + out.write((char *)&INDEX_VERSION, sizeof(INDEX_VERSION)); + + // 2. write k + out.write((char *)&k, sizeof(k)); + + // 3. write number of transcripts + out.write((char *)&num_trans, sizeof(num_trans)); + + // 4. write out transcript lengths + for (int tlen : trans_lens_) { + out.write((char *)&tlen, sizeof(tlen)); + } + + size_t kmap_size = kmap.size(); + + if (writeKmerTable) { + // 5. write number of k-mers in map + out.write((char *)&kmap_size, sizeof(kmap_size)); + + // 6. write kmer->ec values + for (auto& kv : kmap) { + out.write((char *)&kv.first, sizeof(kv.first)); + out.write((char *)&kv.second, sizeof(kv.second)); + } + } else { + // 5. write fake k-mer size + kmap_size = 0; + out.write((char *)&kmap_size, sizeof(kmap_size)); + + // 6. write none of the kmer->ec values + } + // 7. write number of equivalence classes + size_t tmp_size; + tmp_size = ecmap.size(); + out.write((char *)&tmp_size, sizeof(tmp_size)); + + // 8. write out each equiv class + for (auto& kv : ecmap) { + out.write((char *)&kv.first, sizeof(kv.first)); + + // 8.1 write out the size of equiv class + tmp_size = kv.second.size(); + out.write((char *)&tmp_size, sizeof(tmp_size)); + // 8.2 write each member + for (auto& val: kv.second) { + out.write((char *)&val, sizeof(val)); + } + } + + // 9. Write out target ids + // XXX: num_trans should equal to target_names_.size(), so don't need + // to write out again. + assert(num_trans == target_names_.size()); + for (auto& tid : target_names_) { + // 9.1 write out how many bytes + // XXX: Note: this doesn't actually encore the max targ id size. + // might cause problems in the future + tmp_size = tid.size(); + out.write((char *)&tmp_size, sizeof(tmp_size)); + + // 9.2 write out the actual string + out.write(tid.c_str(), tid.size()); + } + + out.flush(); + out.close(); + } + + // note opt is not const + void load(ProgramOptions& opt, bool loadKmerTable = true) { + + std::string& index_in = opt.index; + std::ifstream in; + + in.open(index_in, std::ios::in | std::ios::binary); + + if (!in.is_open()) { + // TODO: better handling + std::cerr << "Error: index input file could not be opened!"; + exit(1); + } + + // 1. read version + size_t header_version = 0; + in.read((char *)&header_version, sizeof(header_version)); + + if (header_version != INDEX_VERSION) { + std::cerr << "Error: Incompatiple indices. Found version " << header_version << ", expected version " << INDEX_VERSION << std::endl + << "Rerun with index to regenerate!"; + exit(1); + } + + // 2. read k + in.read((char *)&k, sizeof(k)); + if (Kmer::k == 0) { + //std::cerr << "[index] no k has been set, setting k = " << k << std::endl; + Kmer::set_k(k); + opt.k = k; + } else if (Kmer::k == k) { + //std::cerr << "[index] Kmer::k has been set and matches" << k << std::endl; + opt.k = k; + } else { + std::cerr << "Error: Kmer::k was already set to = " << Kmer::k << std::endl + << " conflicts with value of k = " << k << std::endl; + exit(1); + } + + // 3. read number of transcripts + in.read((char *)&num_trans, sizeof(num_trans)); + + // 4. read number of transcripts + trans_lens_.clear(); + trans_lens_.reserve(num_trans); + + for (int i = 0; i < num_trans; i++) { + int tlen; + in.read((char *)&tlen, sizeof(tlen)); + trans_lens_.push_back(tlen); + } + + // 5. read number of k-mers + size_t kmap_size; + in.read((char *)&kmap_size, sizeof(kmap_size)); + + std::cerr << "[index] k: " << k << std::endl; + std::cerr << "[index] num_trans read: " << num_trans << std::endl; + std::cerr << "[index] kmap size: " << kmap_size << std::endl; + + kmap.clear(); + if (loadKmerTable) { + kmap.reserve(kmap_size); + } + + // 6. read kmer->ec values + Kmer tmp_kmer; + int tmp_val; + for (size_t i = 0; i < kmap_size; ++i) { + in.read((char *)&tmp_kmer, sizeof(tmp_kmer)); + in.read((char *)&tmp_val, sizeof(tmp_val)); + + if (loadKmerTable) { + kmap.insert({tmp_kmer, tmp_val}); + } + } + + // 7. read number of equivalence classes + size_t ecmap_size; + in.read((char *)&ecmap_size, sizeof(ecmap_size)); + + std::cerr << "[index] ecmap size: " << ecmap_size << std::endl; + + int tmp_id; + size_t vec_size; + // 8. read each equiv class + for (size_t i = 0; i < ecmap_size; ++i) { + in.read((char *)&tmp_id, sizeof(tmp_id)); + + // 8.1 read size of equiv class + in.read((char *)&vec_size, sizeof(vec_size)); + + // 8.2 read each member + std::vector tmp_vec; + tmp_vec.reserve(vec_size); + for (size_t j = 0; j < vec_size; ++j ) { + in.read((char *)&tmp_val, sizeof(tmp_val)); + tmp_vec.push_back(tmp_val); + } + ecmap.insert({tmp_id, tmp_vec}); + ecmapinv.insert({tmp_vec, tmp_id}); + } + + // 9. read in target ids + target_names_.clear(); + target_names_.reserve(num_trans); + + size_t tmp_size; + char buffer[1024]; // if your target_name is longer than this, screw you. + for (auto i = 0; i < num_trans; ++i) { + // 9.1 read in the size + in.read((char *)&tmp_size, sizeof(tmp_size)); + + // 9.2 read in the character string + in.read(buffer, tmp_size); + + std::string tmp_targ_id( buffer ); + target_names_.push_back(std::string( buffer )); + + // clear the buffer for next string + memset(buffer,0,strlen(buffer)); + } + + in.close(); + } + + int k; // k-mer size used + int num_trans; // number of transcripts + int skip; + //std::unordered_map kmap; + KmerHashTable kmap; - int k; // k-mer size used - int num_trans; // number of transcripts - int skip; - //std::unordered_map kmap; - KmerHashTable kmap; - - EcMap ecmap; - std::unordered_map, int, SortedVectorHasher> ecmapinv; - const size_t INDEX_VERSION = 4; // increase this every time you change the fileformat + EcMap ecmap; + std::unordered_map, int, SortedVectorHasher> ecmapinv; + const size_t INDEX_VERSION = 4; // increase this every time you change the fileformat - std::vector trans_lens_; + std::vector trans_lens_; - std::vector target_names_; + std::vector target_names_; }; #endif // KALLISTO_KMERINDEX_H diff --git a/src/KmerIterator.cpp b/src/KmerIterator.cpp index 0a089934..0f053fb3 100644 --- a/src/KmerIterator.cpp +++ b/src/KmerIterator.cpp @@ -7,7 +7,7 @@ /* Note: That an iter is exhausted means that (iter._invalid == true) */ // use: ++iter; -// pre: +// pre: // post: *iter is now exhausted // OR *iter is the next valid pair of kmer and location KmerIterator& KmerIterator::operator++() { @@ -26,17 +26,17 @@ KmerIterator& KmerIterator::operator++() { // use: iter++; -// pre: +// pre: // post: iter has been incremented by one KmerIterator KmerIterator::operator++(int) { - KmerIterator tmp(*this); - operator++(); + KmerIterator tmp(*this); + operator++(); return tmp; } // use: val = (a == b); -// pre: +// pre: // post: (val == true) if a and b are both exhausted // OR a and b are in the same location of the same string. // (val == false) otherwise. @@ -50,18 +50,18 @@ bool KmerIterator::operator==(const KmerIterator& o) { // use: p = *iter; -// pre: +// pre: // post: p is NULL or a pair of Kmer and int std::pair& KmerIterator::operator*() { return p_; } -// use: example 1: km = iter->first; +// use: example 1: km = iter->first; // example 2: i = iter->second; // pre: *iter is not NULL // post: km will be (*iter).first, i will be (*iter).second -std::pair* KmerIterator::operator->() { +std::pair *KmerIterator::operator->() { return &(operator*()); } @@ -69,7 +69,7 @@ std::pair* KmerIterator::operator->() { // use: iter.raise(km, rep); // post: iter has been incremented by one // if iter is not invalid, km is iter->first and rep is km.rep() -void KmerIterator::raise(Kmer &km, Kmer &rep) { +void KmerIterator::raise(Kmer& km, Kmer& rep) { operator++(); if (!invalid_) { km = p_.first; @@ -77,8 +77,8 @@ void KmerIterator::raise(Kmer &km, Kmer &rep) { } } -// use: find_next(i,j, last_valid); -// pre: +// use: find_next(i,j, last_valid); +// pre: // post: *iter is either invalid or is a pair of: // 1) the next valid kmer in the string that does not have any 'N' // 2) the location of that kmer in the string @@ -87,29 +87,29 @@ void KmerIterator::find_next(size_t i, size_t j, bool last_valid) { ++j; while (s_[j] != 0) { - char c = s_[j]; - if (c == 'A' || c == 'C' || c == 'G' || c == 'T') { - if (last_valid) { - p_.first = p_.first.forwardBase(c); - break; // default case, - } else { - if (i + Kmer::k - 1 == j) { - p_.first = Kmer(s_+i); - last_valid = true; - break; // create k-mer from scratch - } else { - ++j; - } - } + char c = s_[j]; + if (c == 'A' || c == 'C' || c == 'G' || c == 'T') { + if (last_valid) { + p_.first = p_.first.forwardBase(c); + break; // default case, } else { + if (i + Kmer::k - 1 == j) { + p_.first = Kmer(s_+i); + last_valid = true; + break; // create k-mer from scratch + } else { ++j; - i = j; - last_valid = false; + } } + } else { + ++j; + i = j; + last_valid = false; + } } if (i+Kmer::k-1 == j && s_[j] != 0) { - p_.second = i; + p_.second = i; } else { - invalid_ = true; + invalid_ = true; } } diff --git a/src/KmerIterator.hpp b/src/KmerIterator.hpp index f90dd60e..b9705354 100644 --- a/src/KmerIterator.hpp +++ b/src/KmerIterator.hpp @@ -5,30 +5,30 @@ #include "Kmer.hpp" -/* Short description: +/* Short description: * - Easily iterate through kmers in a read * - If the read contains any N, then the N is skipped and checked whether * there is a kmer to the right of the N * */ class KmerIterator : public std::iterator, int> { -public: + public: KmerIterator() : s_(NULL), p_(), invalid_(true) {} - KmerIterator(const char* s) : s_(s), p_(), invalid_(false) { find_next(-1,-1,false);} + KmerIterator(const char *s) : s_(s), p_(), invalid_(false) { find_next(-1,-1,false);} KmerIterator(const KmerIterator& o) : s_(o.s_), p_(o.p_), invalid_(o.invalid_) {} KmerIterator& operator++(); KmerIterator operator++(int); - void raise(Kmer &km, Kmer &rep); + void raise(Kmer& km, Kmer& rep); bool operator==(const KmerIterator& o); bool operator!=(const KmerIterator& o) { return !this->operator==(o);} std::pair& operator*(); - std::pair* operator->(); + std::pair *operator->(); -private: + private: void find_next(size_t i, size_t j, bool last_valid); - + const char *s_; std::pair p_; bool invalid_; diff --git a/src/MinCollector.h b/src/MinCollector.h index e351473e..c42ae535 100644 --- a/src/MinCollector.h +++ b/src/MinCollector.h @@ -12,89 +12,89 @@ template struct MinCollector { - -MinCollector(Index &ind, const ProgramOptions& opt) : index(ind), counts(index.ecmap.size(), 0) {} - - - - void collect(std::vector& v) { - if (v.empty()) { - return; - } - sort(v.begin(), v.end()); // sort by increasing order - - int count = 1; // how many k-mer support the ec - std::vector u = index.ecmap[v[0]]; - - for (int i = 1; i < v.size(); i++) { - if (v[i] != v[i-1]) { - u = index.intersect(v[i],u); - if (u.empty()) { - break; - } - } - count++; // increase the count - } - // if u is empty do nothing - if (u.empty()) { - return; - } - - auto search = index.ecmapinv.find(u); - if (search != index.ecmapinv.end()) { - // ec class already exists, update count - ++counts[search->second]; - } else { - // new ec class, update the index and count - auto necs = counts.size(); - index.ecmap.insert({necs,u}); - index.ecmapinv.insert({u,necs}); - counts.push_back(1); - } - } - - void write(std::ostream& o) { - for (int id = 0; id < counts.size(); id++) { - o << id << "\t" << counts[id] << "\n"; - } - } - - void loadCounts(ProgramOptions& opt) { - int num_ecs = counts.size(); - counts.clear(); - std::ifstream in((opt.output + "/counts.txt")); - int i = 0; - if (in.is_open()) { - std::string line; - while (getline(in, line)) { - std::stringstream ss(line); - int j,c; - ss >> j; - ss >> c; - if (j != i) { - std::cerr << "Error: equivalence class does not match index. Found " - << j << ", expected " << i << std::endl; - exit(1); - } - counts.push_back(c); - i++; - } - - if (i != num_ecs) { - std::cerr << "Error: number of equivalence classes does not match index. Found " - << i << ", expected " << num_ecs << std::endl; - exit(1); - } - } else { - std::cerr << "Error: Could not open file " << opt.output << "/counts.txt" << std::endl; - exit(1); - - } - } - - Index &index; - std::vector counts; - + + MinCollector(Index& ind, const ProgramOptions& opt) : index(ind), counts(index.ecmap.size(), 0) {} + + + + void collect(std::vector& v) { + if (v.empty()) { + return; + } + sort(v.begin(), v.end()); // sort by increasing order + + int count = 1; // how many k-mer support the ec + std::vector u = index.ecmap[v[0]]; + + for (int i = 1; i < v.size(); i++) { + if (v[i] != v[i-1]) { + u = index.intersect(v[i],u); + if (u.empty()) { + break; + } + } + count++; // increase the count + } + // if u is empty do nothing + if (u.empty()) { + return; + } + + auto search = index.ecmapinv.find(u); + if (search != index.ecmapinv.end()) { + // ec class already exists, update count + ++counts[search->second]; + } else { + // new ec class, update the index and count + auto necs = counts.size(); + index.ecmap.insert({necs,u}); + index.ecmapinv.insert({u,necs}); + counts.push_back(1); + } + } + + void write(std::ostream& o) { + for (int id = 0; id < counts.size(); id++) { + o << id << "\t" << counts[id] << "\n"; + } + } + + void loadCounts(ProgramOptions& opt) { + int num_ecs = counts.size(); + counts.clear(); + std::ifstream in((opt.output + "/counts.txt")); + int i = 0; + if (in.is_open()) { + std::string line; + while (getline(in, line)) { + std::stringstream ss(line); + int j,c; + ss >> j; + ss >> c; + if (j != i) { + std::cerr << "Error: equivalence class does not match index. Found " + << j << ", expected " << i << std::endl; + exit(1); + } + counts.push_back(c); + i++; + } + + if (i != num_ecs) { + std::cerr << "Error: number of equivalence classes does not match index. Found " + << i << ", expected " << num_ecs << std::endl; + exit(1); + } + } else { + std::cerr << "Error: Could not open file " << opt.output << "/counts.txt" << std::endl; + exit(1); + + } + } + + Index& index; + std::vector counts; + }; #endif // KALLISTO_MINCOLLECTOR_H diff --git a/src/ProcessReads.h b/src/ProcessReads.h index 74d71a07..4f7849da 100644 --- a/src/ProcessReads.h +++ b/src/ProcessReads.h @@ -13,78 +13,78 @@ template TranscriptCollector ProcessReads(Index& index, const ProgramOptions& opt) { - - // need to receive an index map - std::ios_base::sync_with_stdio(false); - - - bool paired = (opt.files.size() == 2); - - gzFile fp1 = 0, fp2 = 0; - kseq_t *seq1 = 0, *seq2; - std::vector v; - v.reserve(1000); - - int l1,l2; // length of read - size_t nreads = 0; - - TranscriptCollector tc(index, opt); - - // for each file - - fp1 = gzopen(opt.files[0].c_str(), "r"); - seq1 = kseq_init(fp1); - if (paired) { - fp2 = gzopen(opt.files[1].c_str(),"r"); - seq2 = kseq_init(fp2); - } - - - // for each read - while (true) { - l1 = kseq_read(seq1); - if (paired) { - l2 = kseq_read(seq2); - } - if (l1 <= 0) { - break; - } - if (paired && l2 <= 0) { - break; - } - - nreads++; - v.clear(); - // process read - index.match(seq1->seq.s, seq1->seq.l, v); - if (paired) { - index.match(seq2->seq.s, seq2->seq.l, v); - } - - // collect the transcript information - tc.collect(v); - if (opt.verbose && nreads % 10000 == 0 ) { - std::cerr << "Processed " << nreads << std::endl; - } - } - gzclose(fp1); - if (paired) { - gzclose(fp2); - } - - kseq_destroy(seq1); - if (paired) { - kseq_destroy(seq2); - } - - // write output to outdir - std::string outfile = opt.output + "/counts.txt"; // figure out filenaming scheme - std::ofstream of; - of.open(outfile.c_str(), std::ios::out); - tc.write(of); - of.close(); - - return tc; + + // need to receive an index map + std::ios_base::sync_with_stdio(false); + + + bool paired = (opt.files.size() == 2); + + gzFile fp1 = 0, fp2 = 0; + kseq_t *seq1 = 0, *seq2; + std::vector v; + v.reserve(1000); + + int l1,l2; // length of read + size_t nreads = 0; + + TranscriptCollector tc(index, opt); + + // for each file + + fp1 = gzopen(opt.files[0].c_str(), "r"); + seq1 = kseq_init(fp1); + if (paired) { + fp2 = gzopen(opt.files[1].c_str(),"r"); + seq2 = kseq_init(fp2); + } + + + // for each read + while (true) { + l1 = kseq_read(seq1); + if (paired) { + l2 = kseq_read(seq2); + } + if (l1 <= 0) { + break; + } + if (paired && l2 <= 0) { + break; + } + + nreads++; + v.clear(); + // process read + index.match(seq1->seq.s, seq1->seq.l, v); + if (paired) { + index.match(seq2->seq.s, seq2->seq.l, v); + } + + // collect the transcript information + tc.collect(v); + if (opt.verbose && nreads % 10000 == 0 ) { + std::cerr << "Processed " << nreads << std::endl; + } + } + gzclose(fp1); + if (paired) { + gzclose(fp2); + } + + kseq_destroy(seq1); + if (paired) { + kseq_destroy(seq2); + } + + // write output to outdir + std::string outfile = opt.output + "/counts.txt"; // figure out filenaming scheme + std::ofstream of; + of.open(outfile.c_str(), std::ios::out); + tc.write(of); + of.close(); + + return tc; } diff --git a/src/common.h b/src/common.h index aabcf409..443ecbcd 100644 --- a/src/common.h +++ b/src/common.h @@ -12,18 +12,18 @@ struct ProgramOptions { - bool verbose; - int threads; - std::string index; - int k; - int iterations; - std::string output; - int skip; - size_t seed; - std::string transfasta; - std::vector files; - -ProgramOptions() : verbose(false), seed(0), threads(1), k(21), iterations(500), skip(1) {} + bool verbose; + int threads; + std::string index; + int k; + int iterations; + std::string output; + int skip; + size_t seed; + std::string transfasta; + std::vector files; + + ProgramOptions() : verbose(false), seed(0), threads(1), k(21), iterations(500), skip(1) {} }; #endif // KALLISTO_COMMON_H diff --git a/src/hash.cpp b/src/hash.cpp index 39ae90f3..d52eee30 100644 --- a/src/hash.cpp +++ b/src/hash.cpp @@ -6,49 +6,49 @@ uint64_t inline _rotl64(uint64_t value, int8_t amount) { return ((value) << (amount)) | ((value) >> (64 - (amount))); } -uint32_t SuperFastHash (const char * data, int len) { -uint32_t hash = len, tmp; -int rem; - - if (len <= 0 || data == NULL) return 0; - - rem = len & 3; - len >>= 2; - - /* Main loop */ - for (;len > 0; len--) { - hash += get16bits (data); - tmp = (get16bits (data+2) << 11) ^ hash; - hash = (hash << 16) ^ tmp; - data += 2*sizeof (uint16_t); - hash += hash >> 11; - } - - /* Handle end cases */ - switch (rem) { - case 3: hash += get16bits (data); - hash ^= hash << 16; - hash ^= data[sizeof (uint16_t)] << 18; - hash += hash >> 11; - break; - case 2: hash += get16bits (data); - hash ^= hash << 11; - hash += hash >> 17; - break; - case 1: hash += *data; - hash ^= hash << 10; - hash += hash >> 1; - } - - /* Force "avalanching" of final 127 bits */ - hash ^= hash << 3; - hash += hash >> 5; - hash ^= hash << 4; +uint32_t SuperFastHash (const char *data, int len) { + uint32_t hash = len, tmp; + int rem; + + if (len <= 0 || data == NULL) { return 0; } + + rem = len & 3; + len >>= 2; + + /* Main loop */ + for (; len > 0; len--) { + hash += get16bits (data); + tmp = (get16bits (data+2) << 11) ^ hash; + hash = (hash << 16) ^ tmp; + data += 2*sizeof (uint16_t); + hash += hash >> 11; + } + + /* Handle end cases */ + switch (rem) { + case 3: hash += get16bits (data); + hash ^= hash << 16; + hash ^= data[sizeof (uint16_t)] << 18; + hash += hash >> 11; + break; + case 2: hash += get16bits (data); + hash ^= hash << 11; hash += hash >> 17; - hash ^= hash << 25; - hash += hash >> 6; - - return hash; + break; + case 1: hash += *data; + hash ^= hash << 10; + hash += hash >> 1; + } + + /* Force "avalanching" of final 127 bits */ + hash ^= hash << 3; + hash += hash >> 5; + hash ^= hash << 4; + hash += hash >> 17; + hash ^= hash << 25; + hash += hash >> 6; + + return hash; } @@ -58,25 +58,23 @@ int rem; // Block read - if your platform needs to do endian-swapping or can only // handle aligned reads, do the conversion here -inline uint64_t getblock ( const uint64_t * p, int i ) -{ +inline uint64_t getblock ( const uint64_t *p, int i ) { return p[i]; } //---------- // Block mix - combine the key bits with the hash bits and scramble everything -inline void bmix64 ( uint64_t & h1, uint64_t & h2, uint64_t & k1, uint64_t & k2, uint64_t & c1, uint64_t & c2 ) -{ - k1 *= c1; - k1 = _rotl64(k1,23); +inline void bmix64 ( uint64_t& h1, uint64_t& h2, uint64_t& k1, uint64_t& k2, uint64_t& c1, uint64_t& c2 ) { + k1 *= c1; + k1 = _rotl64(k1,23); k1 *= c2; h1 ^= k1; h1 += h2; h2 = _rotl64(h2,41); - k2 *= c2; + k2 *= c2; k2 = _rotl64(k2,23); k2 *= c1; h2 ^= k2; @@ -92,8 +90,7 @@ inline void bmix64 ( uint64_t & h1, uint64_t & h2, uint64_t & k1, uint64_t & k2, //---------- // Finalization mix - avalanches all bits to within 0.05% bias -inline uint64_t fmix64 ( uint64_t k ) -{ +inline uint64_t fmix64 ( uint64_t k ) { k ^= k >> 33; k *= 0xff51afd7ed558ccd; k ^= k >> 33; @@ -103,9 +100,8 @@ inline uint64_t fmix64 ( uint64_t k ) return k; } -void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, void * out ) -{ - const uint8_t * data = (const uint8_t*)key; +void MurmurHash3_x64_128 ( const void *key, const int len, const uint32_t seed, void *out ) { + const uint8_t *data = (const uint8_t *)key; const int nblocks = len / 16; uint64_t h1 = 0x9368e53c2f6af274 ^ seed; @@ -117,44 +113,42 @@ void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, //---------- // body - const uint64_t * blocks = (const uint64_t *)(data); + const uint64_t *blocks = (const uint64_t *)(data); - for(int i = 0; i < nblocks; i++) - { - uint64_t k1 = getblock(blocks,i*2+0); - uint64_t k2 = getblock(blocks,i*2+1); + for(int i = 0; i < nblocks; i++) { + uint64_t k1 = getblock(blocks,i*2+0); + uint64_t k2 = getblock(blocks,i*2+1); - bmix64(h1,h2,k1,k2,c1,c2); - } + bmix64(h1,h2,k1,k2,c1,c2); + } //---------- // tail - const uint8_t * tail = (const uint8_t*)(data + nblocks*16); + const uint8_t *tail = (const uint8_t *)(data + nblocks*16); uint64_t k1 = 0; uint64_t k2 = 0; - switch(len & 15) - { - case 15: k2 ^= uint64_t(tail[14]) << 48; - case 14: k2 ^= uint64_t(tail[13]) << 40; - case 13: k2 ^= uint64_t(tail[12]) << 32; - case 12: k2 ^= uint64_t(tail[11]) << 24; - case 11: k2 ^= uint64_t(tail[10]) << 16; - case 10: k2 ^= uint64_t(tail[ 9]) << 8; - case 9: k2 ^= uint64_t(tail[ 8]) << 0; - - case 8: k1 ^= uint64_t(tail[ 7]) << 56; - case 7: k1 ^= uint64_t(tail[ 6]) << 48; - case 6: k1 ^= uint64_t(tail[ 5]) << 40; - case 5: k1 ^= uint64_t(tail[ 4]) << 32; - case 4: k1 ^= uint64_t(tail[ 3]) << 24; - case 3: k1 ^= uint64_t(tail[ 2]) << 16; - case 2: k1 ^= uint64_t(tail[ 1]) << 8; - case 1: k1 ^= uint64_t(tail[ 0]) << 0; - bmix64(h1,h2,k1,k2,c1,c2); - }; + switch(len & 15) { + case 15: k2 ^= uint64_t(tail[14]) << 48; + case 14: k2 ^= uint64_t(tail[13]) << 40; + case 13: k2 ^= uint64_t(tail[12]) << 32; + case 12: k2 ^= uint64_t(tail[11]) << 24; + case 11: k2 ^= uint64_t(tail[10]) << 16; + case 10: k2 ^= uint64_t(tail[ 9]) << 8; + case 9: k2 ^= uint64_t(tail[ 8]) << 0; + + case 8: k1 ^= uint64_t(tail[ 7]) << 56; + case 7: k1 ^= uint64_t(tail[ 6]) << 48; + case 6: k1 ^= uint64_t(tail[ 5]) << 40; + case 5: k1 ^= uint64_t(tail[ 4]) << 32; + case 4: k1 ^= uint64_t(tail[ 3]) << 24; + case 3: k1 ^= uint64_t(tail[ 2]) << 16; + case 2: k1 ^= uint64_t(tail[ 1]) << 8; + case 1: k1 ^= uint64_t(tail[ 0]) << 0; + bmix64(h1,h2,k1,k2,c1,c2); + }; //---------- // finalization @@ -170,32 +164,30 @@ void MurmurHash3_x64_128 ( const void * key, const int len, const uint32_t seed, h1 += h2; h2 += h1; - ((uint64_t*)out)[0] = h1; - ((uint64_t*)out)[1] = h2; + ((uint64_t *)out)[0] = h1; + ((uint64_t *)out)[1] = h2; } //----------------------------------------------------------------------------- -// If we need a smaller hash value, it's faster to just use a portion of the +// If we need a smaller hash value, it's faster to just use a portion of the // 128-bit hash -void MurmurHash3_x64_32 ( const void * key, int len, uint32_t seed, void * out ) -{ +void MurmurHash3_x64_32 ( const void *key, int len, uint32_t seed, void *out ) { uint32_t temp[4]; MurmurHash3_x64_128(key,len,seed,temp); - *(uint32_t*)out = temp[0]; + *(uint32_t *)out = temp[0]; } //---------- -void MurmurHash3_x64_64 ( const void * key, int len, uint32_t seed, void * out ) -{ +void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out ) { uint64_t temp[2]; MurmurHash3_x64_128(key,len,seed,temp); - *(uint64_t*)out = temp[0]; -} + *(uint64_t *)out = temp[0]; +} //----------------------------------------------------------------------------- diff --git a/src/hash.hpp b/src/hash.hpp index 0b33f477..2f83c6c7 100644 --- a/src/hash.hpp +++ b/src/hash.hpp @@ -13,9 +13,9 @@ +(uint32_t)(((const uint8_t *)(d))[0]) ) #endif -uint32_t SuperFastHash (const char * data, int len); +uint32_t SuperFastHash (const char *data, int len); //void MurmurHash3_x64_32 ( const void * key, int len, uint32_t seed, void * out ); -void MurmurHash3_x64_64 ( const void * key, int len, uint32_t seed, void * out ); +void MurmurHash3_x64_64 ( const void *key, int len, uint32_t seed, void *out ); #endif diff --git a/src/kseq.h b/src/kseq.h index b2238d1d..9645bc43 100644 --- a/src/kseq.h +++ b/src/kseq.h @@ -78,8 +78,8 @@ #ifndef KSTRING_T #define KSTRING_T kstring_t typedef struct __kstring_t { - size_t l, m; - char *s; + size_t l, m; + char *s; } kstring_t; #endif diff --git a/src/main.cpp b/src/main.cpp index b375c4c7..f864a056 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -27,23 +27,22 @@ KSEQ_INIT(gzFile, gzread) using namespace std; -void ParseOptionsIndex(int argc, char **argv, ProgramOptions &opt) { +void ParseOptionsIndex(int argc, char **argv, ProgramOptions& opt) { int verbose_flag = 0; - const char* opt_string = "i:k:f:"; - static struct option long_options[] = - { - // long args + const char *opt_string = "i:k:f:"; + static struct option long_options[] = { + // long args {"verbose", no_argument, &verbose_flag, 1}, - // short args - {"index", required_argument, 0, 'i'}, + // short args + {"index", required_argument, 0, 'i'}, {"kmer-size", required_argument, 0, 'k'}, - {"trans-fasta", required_argument, 0, 'f'}, + {"trans-fasta", required_argument, 0, 'f'}, {0,0,0,0} }; - int c; + int c; int option_index = 0; - while (true) { + while (true) { c = getopt_long(argc,argv,opt_string, long_options, &option_index); if (c == -1) { @@ -53,21 +52,18 @@ void ParseOptionsIndex(int argc, char **argv, ProgramOptions &opt) { switch (c) { case 0: break; - case 'i': - { - opt.index = optarg; - break; - } - case 'k': - { + case 'i': { + opt.index = optarg; + break; + } + case 'k': { stringstream(optarg) >> opt.k; break; - } - case 'f': - { - opt.transfasta = optarg; - break; - } + } + case 'f': { + opt.transfasta = optarg; + break; + } default: break; } } @@ -78,25 +74,24 @@ void ParseOptionsIndex(int argc, char **argv, ProgramOptions &opt) { } -void ParseOptionsEM(int argc, char **argv, ProgramOptions &opt) { +void ParseOptionsEM(int argc, char **argv, ProgramOptions& opt) { int verbose_flag = 0; - const char* opt_string = "t:i:s:o:n:"; - static struct option long_options[] = - { - // long args + const char *opt_string = "t:i:s:o:n:"; + static struct option long_options[] = { + // long args {"verbose", no_argument, &verbose_flag, 1}, - // short args + // short args {"threads", required_argument, 0, 't'}, - {"index", required_argument, 0, 'i'}, + {"index", required_argument, 0, 'i'}, {"skip", required_argument, 0, 's'}, {"output-dir", required_argument, 0, 'o'}, - {"iterations", required_argument, 0, 'n'}, + {"iterations", required_argument, 0, 'n'}, {0,0,0,0} }; - int c; + int c; int option_index = 0; - while (true) { + while (true) { c = getopt_long(argc,argv,opt_string, long_options, &option_index); if (c == -1) { @@ -106,31 +101,26 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions &opt) { switch (c) { case 0: break; - case 't': - { - stringstream(optarg) >> opt.threads; + case 't': { + stringstream(optarg) >> opt.threads; + break; + } + case 'i': { + opt.index = optarg; break; - } - case 'i': - { - opt.index = optarg; - break; - } - case 's': - { - stringstream(optarg) >> opt.skip; + } + case 's': { + stringstream(optarg) >> opt.skip; break; - } - case 'o': - { + } + case 'o': { opt.output = optarg; break; - } - case 'n': - { - stringstream(optarg) >> opt.iterations; - break; - } + } + case 'n': { + stringstream(optarg) >> opt.iterations; + break; + } default: break; } } @@ -145,24 +135,23 @@ void ParseOptionsEM(int argc, char **argv, ProgramOptions &opt) { } } -void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions &opt) { +void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions& opt) { int verbose_flag = 0; - const char* opt_string = "t:s:o:n:"; - static struct option long_options[] = - { - // long args + const char *opt_string = "t:s:o:n:"; + static struct option long_options[] = { + // long args {"verbose", no_argument, &verbose_flag, 1}, - // short args + // short args {"threads", required_argument, 0, 't'}, {"seed", required_argument, 0, 's'}, {"output-dir", required_argument, 0, 'o'}, - {"iterations", required_argument, 0, 'n'}, + {"iterations", required_argument, 0, 'n'}, {0,0,0,0} }; - int c; + int c; int option_index = 0; - while (true) { + while (true) { c = getopt_long(argc,argv,opt_string, long_options, &option_index); if (c == -1) { @@ -172,26 +161,22 @@ void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions &opt) { switch (c) { case 0: break; - case 't': - { - stringstream(optarg) >> opt.threads; + case 't': { + stringstream(optarg) >> opt.threads; break; - } - case 's': - { - stringstream(optarg) >> opt.seed; + } + case 's': { + stringstream(optarg) >> opt.seed; break; - } - case 'o': - { + } + case 'o': { opt.output = optarg; break; - } - case 'n': - { - stringstream(optarg) >> opt.iterations; - break; - } + } + case 'n': { + stringstream(optarg) >> opt.iterations; + break; + } default: break; } } @@ -204,138 +189,138 @@ void ParseOptionsEMOnly(int argc, char **argv, ProgramOptions &opt) { bool CheckOptionsIndex(ProgramOptions& opt) { - bool ret = true; - - if (opt.k <= 0 || opt.k >= Kmer::MAX_K) { - cerr << "Error: invalid k-mer size " << opt.k << ", maximum is " << (Kmer::MAX_K -1) << endl; - ret = false; - } - - // we want to generate the index, check k, index and transfasta - struct stat stFileInfo; - auto intStat = stat(opt.transfasta.c_str(), &stFileInfo); - if (intStat != 0) { - cerr << "Error: transcript fasta file not found " << opt.transfasta << endl; - ret = false; - } - - if (opt.index.empty()) { - cerr << "Error: need to specify index name" << endl; - ret = false; - } - - return ret; + bool ret = true; + + if (opt.k <= 0 || opt.k >= Kmer::MAX_K) { + cerr << "Error: invalid k-mer size " << opt.k << ", maximum is " << (Kmer::MAX_K -1) << endl; + ret = false; + } + + // we want to generate the index, check k, index and transfasta + struct stat stFileInfo; + auto intStat = stat(opt.transfasta.c_str(), &stFileInfo); + if (intStat != 0) { + cerr << "Error: transcript fasta file not found " << opt.transfasta << endl; + ret = false; + } + + if (opt.index.empty()) { + cerr << "Error: need to specify index name" << endl; + ret = false; + } + + return ret; } bool CheckOptionsEM(ProgramOptions& opt, bool emonly = false) { - bool ret = true; - - - // check for index - if (!emonly) { - if (opt.index.empty()) { - cerr << "Error: index file missing" << endl; - ret = false; - } else { - struct stat stFileInfo; - auto intStat = stat(opt.index.c_str(), &stFileInfo); - if (intStat != 0) { - cerr << "Error: index file not found " << opt.index << endl; - ret = false; - } - } - } - - // check for read files - if (!emonly) { - if (opt.files.size() == 0) { - cerr << "Error: Missing read files" << endl; - ret = false; - } else { - struct stat stFileInfo; - for (auto &fn : opt.files) { - auto intStat = stat(fn.c_str(), &stFileInfo); - if (intStat != 0) { - cerr << "Error: file not found " << fn << endl; - ret = false; - } - } - } - - if (!(opt.files.size() == 1 || opt.files.size() == 2)) { - cerr << "Error: Input files should be 1 or 2 files only" << endl; - ret = false; - } - - if (opt.skip <= 0) { - cerr << "Error: skip has to be a positive integer" << endl; - ret = false; - } - } - - - if (opt.iterations <= 0) { - cerr << "Error: Invalid number of iterations " << opt.iterations << endl; - ret = false; - } - - if (opt.output.empty()) { - cerr << "Error: need to specify output directory " << opt.output << endl; - ret = false; - } else { - struct stat stFileInfo; - auto intStat = stat(opt.output.c_str(), &stFileInfo); - if (intStat == 0) { - // file/dir exits - if (!S_ISDIR(stFileInfo.st_mode)) { - cerr << "Error: file " << opt.output << " exists and is not a directory" << endl; - ret = false; - } else if (emonly) { - // check for directory/counts.txt - struct stat stCountInfo; - auto intcountstat = stat((opt.output + "/counts.txt" ).c_str(), &stCountInfo); - if (intcountstat != 0) { - cerr << "Error: could not find file " << opt.output << "/counts.txt" << endl; - ret = false; - } - - // check for directory/index.saved - struct stat stIndexInfo; - auto intindexstat = stat((opt.output + "/index.saved").c_str(), &stIndexInfo); - if (intindexstat != 0) { - cerr << "Error: could not find index " << opt.output << "/index.saved" << endl; - ret = false; - } - opt.index = (opt.output + "/index.saved"); - } - } else { - if (emonly) { - cerr << "Error: output directory needs to exist, run em first" << endl; - ret = false; - } else { - // create directory - if (mkdir(opt.output.c_str(), 0777) == -1) { - cerr << "Error: could not create directory " << opt.output << endl; - ret = false; - } - } - } - } - - if (opt.threads <= 0) { - cerr << "Error: invalid number of threads " << opt.threads << endl; - ret = false; - } else { - unsigned int n = std::thread::hardware_concurrency(); - if (n != 0 && n < opt.threads) { - cerr << "Warning: you asked for " << opt.threads - << ", but only " << n << " cores on the machine" << endl; - } - } - - - return ret; + bool ret = true; + + + // check for index + if (!emonly) { + if (opt.index.empty()) { + cerr << "Error: index file missing" << endl; + ret = false; + } else { + struct stat stFileInfo; + auto intStat = stat(opt.index.c_str(), &stFileInfo); + if (intStat != 0) { + cerr << "Error: index file not found " << opt.index << endl; + ret = false; + } + } + } + + // check for read files + if (!emonly) { + if (opt.files.size() == 0) { + cerr << "Error: Missing read files" << endl; + ret = false; + } else { + struct stat stFileInfo; + for (auto& fn : opt.files) { + auto intStat = stat(fn.c_str(), &stFileInfo); + if (intStat != 0) { + cerr << "Error: file not found " << fn << endl; + ret = false; + } + } + } + + if (!(opt.files.size() == 1 || opt.files.size() == 2)) { + cerr << "Error: Input files should be 1 or 2 files only" << endl; + ret = false; + } + + if (opt.skip <= 0) { + cerr << "Error: skip has to be a positive integer" << endl; + ret = false; + } + } + + + if (opt.iterations <= 0) { + cerr << "Error: Invalid number of iterations " << opt.iterations << endl; + ret = false; + } + + if (opt.output.empty()) { + cerr << "Error: need to specify output directory " << opt.output << endl; + ret = false; + } else { + struct stat stFileInfo; + auto intStat = stat(opt.output.c_str(), &stFileInfo); + if (intStat == 0) { + // file/dir exits + if (!S_ISDIR(stFileInfo.st_mode)) { + cerr << "Error: file " << opt.output << " exists and is not a directory" << endl; + ret = false; + } else if (emonly) { + // check for directory/counts.txt + struct stat stCountInfo; + auto intcountstat = stat((opt.output + "/counts.txt" ).c_str(), &stCountInfo); + if (intcountstat != 0) { + cerr << "Error: could not find file " << opt.output << "/counts.txt" << endl; + ret = false; + } + + // check for directory/index.saved + struct stat stIndexInfo; + auto intindexstat = stat((opt.output + "/index.saved").c_str(), &stIndexInfo); + if (intindexstat != 0) { + cerr << "Error: could not find index " << opt.output << "/index.saved" << endl; + ret = false; + } + opt.index = (opt.output + "/index.saved"); + } + } else { + if (emonly) { + cerr << "Error: output directory needs to exist, run em first" << endl; + ret = false; + } else { + // create directory + if (mkdir(opt.output.c_str(), 0777) == -1) { + cerr << "Error: could not create directory " << opt.output << endl; + ret = false; + } + } + } + } + + if (opt.threads <= 0) { + cerr << "Error: invalid number of threads " << opt.threads << endl; + ret = false; + } else { + unsigned int n = std::thread::hardware_concurrency(); + if (n != 0 && n < opt.threads) { + cerr << "Warning: you asked for " << opt.threads + << ", but only " << n << " cores on the machine" << endl; + } + } + + + return ret; } @@ -344,145 +329,139 @@ void PrintCite() { // cerr << "When using this program in your research, please cite" << endl << endl; } -void PrintVersion() -{ +void PrintVersion() { cout << "Kallisto, version: " << KALLISTO_VERSION << endl; } -void usage() -{ - cout << "Kallisto " << endl - << "Does transcriptome stuff" << endl << endl - << "Usage: Kallisto CMD [options] .." << endl << endl - << "Where can be one of:" << endl << endl - << " index Builds the index "<< endl - << " em Runs the EM algorithm " << endl - << " em-only Runs the EM algorithm without mapping" << endl - << " cite Prints citation information " << endl - << " version Prints version information"<< endl << endl; +void usage() { + cout << "Kallisto " << endl + << "Does transcriptome stuff" << endl << endl + << "Usage: Kallisto CMD [options] .." << endl << endl + << "Where can be one of:" << endl << endl + << " index Builds the index "<< endl + << " em Runs the EM algorithm " << endl + << " em-only Runs the EM algorithm without mapping" << endl + << " cite Prints citation information " << endl + << " version Prints version information"<< endl << endl; } -void usageIndex() -{ - cout << "Kallisto " << endl - << "Does transcriptome stuff" << endl << endl - << "Usage: Kallisto index [options]" << endl << endl - << "-k, --kmer-size=INT Size of k-mers, default (21), max value is " << (Kmer::MAX_K-1) << endl - << "-i, --index=STRING Filename for index to be constructed " << endl - << "-f, --trans-fasta=STRING FASTA file containing reference transcriptome " << endl - << " --verbose Print lots of messages during run" << endl; +void usageIndex() { + cout << "Kallisto " << endl + << "Does transcriptome stuff" << endl << endl + << "Usage: Kallisto index [options]" << endl << endl + << "-k, --kmer-size=INT Size of k-mers, default (21), max value is " << (Kmer::MAX_K-1) << endl + << "-i, --index=STRING Filename for index to be constructed " << endl + << "-f, --trans-fasta=STRING FASTA file containing reference transcriptome " << endl + << " --verbose Print lots of messages during run" << endl; } -void usageEM() -{ - cout << "Kallisto " << endl - << "Does transcriptome stuff" << endl << endl - << "Usage: Kallisto em [options] FASTQ-files" << endl << endl - << "-t, --threads=INT Number of threads to use (default value 1)" << endl - << "-i, --index=INT Filename for index " << endl - << "-s, --seed=INT Seed value for randomness (default value 0, use time based randomness)" << endl - << "-n, --iterations=INT Number of iterations of EM algorithm (default value 500)" << endl - << "-o, --output-dir=STRING Directory to store output to" << endl - << " --verbose Print lots of messages during run" << endl; +void usageEM() { + cout << "Kallisto " << endl + << "Does transcriptome stuff" << endl << endl + << "Usage: Kallisto em [options] FASTQ-files" << endl << endl + << "-t, --threads=INT Number of threads to use (default value 1)" << endl + << "-i, --index=INT Filename for index " << endl + << "-s, --seed=INT Seed value for randomness (default value 0, use time based randomness)" << endl + << "-n, --iterations=INT Number of iterations of EM algorithm (default value 500)" << endl + << "-o, --output-dir=STRING Directory to store output to" << endl + << " --verbose Print lots of messages during run" << endl; } -void usageEMOnly() -{ - cout << "Kallisto " << endl - << "Does transcriptome stuff" << endl << endl - << "Usage: Kallisto em-only [options]" << endl << endl - << "-t, --threads=INT Number of threads to use (default value 1)" << endl - << "-s, --seed=INT Seed value for randomness (default value 0, use time based randomness)" << endl - << "-n, --iterations=INT Number of iterations of EM algorithm (default value 500)" << endl - << "-o, --output-dir=STRING Directory to store output to" << endl - << " --verbose Print lots of messages during run" << endl; +void usageEMOnly() { + cout << "Kallisto " << endl + << "Does transcriptome stuff" << endl << endl + << "Usage: Kallisto em-only [options]" << endl << endl + << "-t, --threads=INT Number of threads to use (default value 1)" << endl + << "-s, --seed=INT Seed value for randomness (default value 0, use time based randomness)" << endl + << "-n, --iterations=INT Number of iterations of EM algorithm (default value 500)" << endl + << "-o, --output-dir=STRING Directory to store output to" << endl + << " --verbose Print lots of messages during run" << endl; } -int main(int argc, char *argv[]) -{ - - if (argc < 2) { - usage(); +int main(int argc, char *argv[]) { + + if (argc < 2) { + usage(); + exit(1); + } else { + ProgramOptions opt; + string cmd(argv[1]); + if (cmd == "version") { + PrintVersion(); + } else if (cmd == "cite") { + PrintCite(); + } else if (cmd == "index") { + if (argc==2) { + usageIndex(); + return 0; + } + ParseOptionsIndex(argc-1,argv+1,opt); + if (!CheckOptionsIndex(opt)) { + usageIndex(); + exit(1); + } else { + // create an index + Kmer::set_k(opt.k); + KmerIndex index(opt); + std::cerr << "Building index from: " << opt.transfasta << std::endl; + index.BuildTranscripts(opt.transfasta); + index.write(opt.index); + } + } else if (cmd == "em") { + if (argc==2) { + usageEM(); + return 0; + } + ParseOptionsEM(argc-1,argv+1,opt); + if (!CheckOptionsEM(opt)) { + usageEM(); exit(1); + } else { + // run the em algorithm + KmerIndex index(opt); + index.load(opt); + auto collection = ProcessReads>(index, opt); + // save modified index for future use + index.write((opt.output+"/index.saved"), false); + // compute mean frag length somewhere? + auto eff_lens = calc_eff_lens(index.trans_lens_, 30.0); + auto weights = calc_weights (collection.counts, index.ecmap, eff_lens); + EMAlgorithm em(opt, index, collection.counts, eff_lens, weights); + em.run(); + em.compute_rho(); + em.write(opt.output); + } + } else if (cmd == "em-only") { + if (argc==2) { + usageEMOnly(); + return 0; + } + ParseOptionsEMOnly(argc-1,argv+1,opt); + if (!CheckOptionsEM(opt, true)) { + usageEMOnly(); + exit(1); + } else { + // run the em algorithm + KmerIndex index(opt); + index.load(opt, false); // skip the k-mer map + MinCollector collection(index, opt); + collection.loadCounts(opt); + // compute mean frag length somewhere? + auto eff_lens = calc_eff_lens(index.trans_lens_, 30.0); + auto weights = calc_weights (collection.counts, index.ecmap, eff_lens); + EMAlgorithm em(opt, index, collection.counts, eff_lens, weights); + em.run(); + em.compute_rho(); + em.write(opt.output); + } } else { - ProgramOptions opt; - string cmd(argv[1]); - if (cmd == "version") { - PrintVersion(); - } else if (cmd == "cite") { - PrintCite(); - } else if (cmd == "index") { - if (argc==2) { - usageIndex(); - return 0; - } - ParseOptionsIndex(argc-1,argv+1,opt); - if (!CheckOptionsIndex(opt)) { - usageIndex(); - exit(1); - } else { - // create an index - Kmer::set_k(opt.k); - KmerIndex index(opt); - std::cerr << "Building index from: " << opt.transfasta << std::endl; - index.BuildTranscripts(opt.transfasta); - index.write(opt.index); - } - } else if (cmd == "em") { - if (argc==2) { - usageEM(); - return 0; - } - ParseOptionsEM(argc-1,argv+1,opt); - if (!CheckOptionsEM(opt)) { - usageEM(); - exit(1); - } else { - // run the em algorithm - KmerIndex index(opt); - index.load(opt); - auto collection = ProcessReads>(index, opt); - // save modified index for future use - index.write((opt.output+"/index.saved"), false); - // compute mean frag length somewhere? - auto eff_lens = calc_eff_lens(index.trans_lens_, 30.0); - auto weights = calc_weights (collection.counts, index.ecmap, eff_lens); - EMAlgorithm em(opt, index, collection.counts, eff_lens, weights); - em.run(); - em.compute_rho(); - em.write(opt.output); - } - } else if (cmd == "em-only") { - if (argc==2) { - usageEMOnly(); - return 0; - } - ParseOptionsEMOnly(argc-1,argv+1,opt); - if (!CheckOptionsEM(opt, true)) { - usageEMOnly(); - exit(1); - } else { - // run the em algorithm - KmerIndex index(opt); - index.load(opt, false); // skip the k-mer map - MinCollector collection(index, opt); - collection.loadCounts(opt); - // compute mean frag length somewhere? - auto eff_lens = calc_eff_lens(index.trans_lens_, 30.0); - auto weights = calc_weights (collection.counts, index.ecmap, eff_lens); - EMAlgorithm em(opt, index, collection.counts, eff_lens, weights); - em.run(); - em.compute_rho(); - em.write(opt.output); - } - } else { - cerr << "Did not understand command " << cmd << endl; - usage(); - exit(1); - } - - } - return 0; + cerr << "Did not understand command " << cmd << endl; + usage(); + exit(1); + } + + } + return 0; } diff --git a/src/weights.cpp b/src/weights.cpp deleted file mode 100644 index e69de29b..00000000 diff --git a/src/weights.h b/src/weights.h index 2fec4762..996b9028 100644 --- a/src/weights.h +++ b/src/weights.h @@ -9,49 +9,46 @@ using WeightMap = std::unordered_map>; -std::vector calc_eff_lens(const std::vector& lengths, double mean) -{ - // for now do the total naive thing and subtract mean frag length - std::vector eff_lens; - eff_lens.reserve(lengths.size()); - - for (auto& cur_len: lengths) - { - eff_lens.push_back( static_cast(cur_len) - mean + 1.0 ); - } +std::vector calc_eff_lens(const std::vector& lengths, double mean) { + // for now do the total naive thing and subtract mean frag length + std::vector eff_lens; + eff_lens.reserve(lengths.size()); + + for (auto& cur_len: lengths) { + eff_lens.push_back( static_cast(cur_len) - mean + 1.0 ); + } - return eff_lens; + return eff_lens; } WeightMap calc_weights( - const std::vector& counts, - const EcMap& ecmap, - const std::vector& eff_lens) -{ + const std::vector& counts, + const EcMap& ecmap, + const std::vector& eff_lens) { - // TODO: throw some assertions in here to make sure the length of counts - // and ec map are correct... as well as eff_lens size is reasonable + // TODO: throw some assertions in here to make sure the length of counts + // and ec map are correct... as well as eff_lens size is reasonable - // weights are stored _exactly_ in the same orientation as the ec map - WeightMap weights; + // weights are stored _exactly_ in the same orientation as the ec map + WeightMap weights; - for (auto& kv : ecmap) { + for (auto& kv : ecmap) { - //std::cout << kv.first; - std::vector trans_weights; - trans_weights.reserve(kv.second.size()); + //std::cout << kv.first; + std::vector trans_weights; + trans_weights.reserve(kv.second.size()); - for (auto& trans_id : kv.second) { - trans_weights.push_back( static_cast(counts[kv.first]) / - eff_lens[trans_id] ); - } - - weights.insert( {kv.first, trans_weights} ); + for (auto& trans_id : kv.second) { + trans_weights.push_back( static_cast(counts[kv.first]) / + eff_lens[trans_id] ); } + weights.insert( {kv.first, trans_weights} ); + } + - return weights; + return weights; } #endif // KALLISTO_WEIGHTS_H