diff --git a/include/BFHClass.hpp b/include/BFHClass.hpp index f3053d2..873e276 100644 --- a/include/BFHClass.hpp +++ b/include/BFHClass.hpp @@ -28,9 +28,9 @@ class BFHClass{ public: BFHClass(){} - BFHClass(std::shared_ptr& consoleLogIn){ - consoleLog = consoleLogIn ; - } + BFHClass(std::shared_ptr& consoleLogIn){ + consoleLog = consoleLogIn ; + } void loadBFH( std::string& bfhFile, @@ -39,7 +39,8 @@ class BFHClass{ std::map& cellWhiteListMap, bool generateNoiseProfile, std::unordered_map& cellNoisyMap, - std::string& outDir + std::string& outDir, + bool dump = false ) ; void loadProbability(std::string& file, Reference& refInfo, bool geneLevel) ; @@ -50,7 +51,7 @@ class BFHClass{ } - std::shared_ptr consoleLog ; + std::shared_ptr consoleLog ; std::string bfhFile ; std::vector countProbability ; std::unordered_map> geneCountHistogram ; // Gene id -> (EqClass_Length -> Numebr) diff --git a/include/GFAReader.hpp b/include/GFAReader.hpp index be45079..f3595a6 100644 --- a/include/GFAReader.hpp +++ b/include/GFAReader.hpp @@ -9,9 +9,11 @@ class GFAReader{ public: GFAReader( - std::string gfaFileIn + std::string gfaFileIn, + std::shared_ptr& consoleLogIn ){ gfaFileName_ = gfaFileIn ; + consoleLog = consoleLogIn ; } void parseFile( @@ -19,6 +21,10 @@ class GFAReader{ ) ; void readUnitigs() ; + std::vector> explode( + const std::string str, + const char& ch + ); void updateEqClass( std::string& transcriptName, @@ -55,7 +61,7 @@ class GFAReader{ std::unique_ptr file ; std::unordered_map unitigMap ; - + std::shared_ptr consoleLog; }; diff --git a/include/MatrixParser.hpp b/include/MatrixParser.hpp index 38ab9d2..7c1adec 100644 --- a/include/MatrixParser.hpp +++ b/include/MatrixParser.hpp @@ -160,17 +160,17 @@ public : std::vector> data ; // Matrix containing the Cell x Transcriptome Matrix std::vector> geneCounts ; // Matrix containing the Cell x Gene Matrix std::vector> trueGeneCounts ; // True Matrix containing the Cell x Gene Matrix - std::vector cellNames ; // Vector of 
Cell Names - + // cell specific + std::vector cellNames ; // Vector of Cell Names std::map cellNamesMap ; // Cell Name -> Cell Id std::map cellNamesDupCount ; // Cell Name -> dedup count - std::map cellWhiteListMap ; std::unordered_map cellNoisyMap ; std::unordered_map cellDoubletMap ; - std::unordered_map cell2ClusterMap ; + std::unordered_map cell2ClusterMap ; + std::shared_ptr consoleLog ; // Logger for outputting errors std::map alevin2refMap ; // Map Col 0f Input Matrix -> Gene ID from t2g tsv diff --git a/src/BFHClass.cpp b/src/BFHClass.cpp index 317f2b2..8beaf82 100644 --- a/src/BFHClass.cpp +++ b/src/BFHClass.cpp @@ -37,22 +37,22 @@ void BFHClass::loadBFH( std::map& cellWhiteListMap, bool generateNoiseProfile, std::unordered_map& cellNoisyMap, - std::string& outDir + std::string& outDir, + bool dump ){ if(! util::fs::FileExists(bfhFile.c_str())){ - std::cerr << bfhFile << " does not exists \n" ; + consoleLog->error("{} does not exists", bfhFile) ; std::exit(1) ; } bool createClusterLevelHist{false} ; if(cellClusterFile != ""){ - std::cout << "[DEBUG] cell Clust file " << cellClusterFile << "\n" ; - + consoleLog->info("Feeding cell cluster file {}", cellClusterFile) ; if(! util::fs::FileExists(cellClusterFile.c_str())){ - std::cerr << cellClusterFile << " is not empty and does not exist \n" ; + consoleLog->error("Cell cluster file {} does not exist", cellClusterFile) ; std::exit(1) ; } createClusterLevelHist = true ; @@ -72,16 +72,15 @@ void BFHClass::loadBFH( auto cell_id = it->second ; cell2ClusterMap[cell_id] = cluster_id ; }else{ - std::cerr << "Avoiding this cluster\n" ; + consoleLog->error("Avoiding {} cluster", cluster_id) ; } } - std::cerr << "[DEBUG] read cluster file with size " << cell2ClusterMap.size() << "\n" ; + consoleLog->info("read cluster file with size {}",cell2ClusterMap.size()); } - std::cerr<< "[DEBUG] Reading BFH file ........ 
\n" ; - - std::ifstream dataStream(bfhFile.c_str()) ; - std::string line ; + consoleLog->info("Reading BFH file {}",bfhFile); + std::ifstream dataStream(bfhFile.c_str()) ; + std::string line ; std::getline(dataStream, line) ; uint32_t numTranscripts = std::stoul(line) ; @@ -90,9 +89,9 @@ void BFHClass::loadBFH( std::getline(dataStream, line) ; uint32_t numEqClasses = std::stoul(line) ; - std::cerr << "[DEBUG] numTranscripts: " << numTranscripts << "\n" ; - std::cerr << "[DEBUG] numCells: " << numCells << "\n" ; - std::cerr << "[DEBUG] numEqClasses: " << numEqClasses << "\n" ; + consoleLog->info("numTranscripts: {}", numTranscripts) ; + consoleLog->info("numCells: {}", numCells) ; + consoleLog->info("numEqClasses: {}", numEqClasses) ; std::vector trNames(numTranscripts) ; std::vector CBNames(numCells) ; @@ -101,14 +100,14 @@ void BFHClass::loadBFH( trNames[i] = line ; } - std::cerr << "[DEBUG] Transcripts read \n" ; + consoleLog->info("Transcripts read ") ; for(size_t i = 0; i < numCells; ++i){ std::getline(dataStream, line) ; CBNames[i] = line ; } - std::cerr << "[DEBUG] Cell names read \n" ; + consoleLog->info("Cell names read ") ; // read equivalence classes now uint32_t tot_reads{0} ; @@ -129,7 +128,10 @@ void BFHClass::loadBFH( auto gid = transcript2geneMap[tid] ; geneIds.insert(gid) ; }else{ - std::cerr << "transcript is in the list but no corresponding gene found \n" ; + consoleLog->error("transcript is in the list but no corresponding gene found " + "this signifies that the BFH and the annotation does not belong " + "to the same annotation (e.g. 
gencode version) or same organism" + ) ; std::exit(2) ; } } @@ -195,7 +197,7 @@ void BFHClass::loadBFH( } - std::cerr << "[DEBUG] countHistogram.size() " << countHistogram.size() << "\n" ; + consoleLog->info("countHistogram.size(): {}", countHistogram.size()); @@ -203,21 +205,23 @@ void BFHClass::loadBFH( [](const std::pair& p1, const std::pair& p2) { return p1.first < p2.first; }); - std::cerr << "[DEBUG] x->first, x->second tot_reads " - << x->first << "\t" << x->second - << "\t" << tot_reads << "\n" ; + consoleLog->info("Histogram statistics " + "max value: {}, max freq: {}, total reads {}", + x->first, x->second, tot_reads + ); + consoleLog->info("Converting histogram to a probablity vector"); countProbability.resize(x->first + 1, 0.0) ; for(auto it: countHistogram){ if(it.first >= countProbability.size()){ - std::cerr << "[DEBUG] out of memory " << it.first << "\t" << countProbability.size() << "\n" ; + consoleLog->error("[DEBUG] out of memory {} >= {}",it.first,countProbability.size()) ; } countProbability[it.first] = static_cast(it.second)/static_cast(tot_reads) ; } - + if(dump) { std::string geneCountHistogramFile = outDir + "/geneLevelProb.txt" ; std::cerr << "DEBUG: " << geneCountHistogramFile << "\n" ; @@ -237,7 +241,7 @@ void BFHClass::loadBFH( } - + if(dump) { std::string countProbabilityFile = outDir + "/countProb.txt" ; std::ofstream probStream(countProbabilityFile.c_str()) ; @@ -248,7 +252,7 @@ void BFHClass::loadBFH( } } - std::cerr << "[DEBUG] Exiting after reading BFH \n" ; + consoleLog->info("Exiting after reading BFH ") ; } diff --git a/src/GFAReader.cpp b/src/GFAReader.cpp index d4a3048..48c5abb 100644 --- a/src/GFAReader.cpp +++ b/src/GFAReader.cpp @@ -1,7 +1,7 @@ #include "GFAReader.hpp" #include "macros.hpp" -// Taken from here +// Parts of the methods are taken from here // https://github.com/COMBINE-lab/pufferfish/blob/master/src/GFAConverter.cpp bool is_number(const std::string& s) { @@ -17,7 +17,10 @@ std::string 
getGencodeTranscript(std::string v){ } -std::vector> explode(const std::string str, const char& ch) { +std::vector> GFAReader::explode( + const std::string str, + const char& ch +) { std::string next; std::vector> result; // For each character in the string @@ -36,7 +39,10 @@ std::vector> explode(const std::string str, const char& result.emplace_back(nid, orientation); } catch (std::exception& e) { // not a numeric contig id - std::cerr << "tried to convert " << next << " into a long long\n"; + consoleLog->error("tried to convert {}" + " into a long long", + next + ); std::exit(1); } next.clear(); @@ -47,8 +53,10 @@ std::vector> explode(const std::string str, const char& } } if (!next.empty()) { - std::cerr << "impossible is the opposite of possible " << next << "\n"; - std::cerr << "The line is " << str << "\n"; + consoleLog->error("impossible is the opposite of possible {} " + "The line is {} ", + next,str + ); result.emplace_back(std::stoll(next), true); // this case shouldn't even happen } @@ -139,129 +147,134 @@ void GFAReader::readUnitigs(){ void GFAReader::parseFile( Reference& refInfo ){ - std::cerr << "Start loading segments... 
\n" ; - size_t contigCt{0} ; - std::string ln, tag, id, value ; + consoleLog->info("Parsing GFA file {}", gfaFileName_); + consoleLog->info("Start loading segments...") ; + size_t contigCt{0} ; + std::string ln, tag, id, value ; - // calculate overlap size on the fly - bool foundOverlapSize{false}; - size_t overlapsize = READ_LEN-1; - file.reset(new std::ifstream(gfaFileName_)) ; + //We need to calculate the overlap size on the fly + //if and only if the new pufferize TwoPaCo is run + //otherwise it is not important, we can go without + + //ThreePaCo + //overlapSize = READ_LEN - 1 + //TwoPaCo + //overlapSize = READ_LEN + 1 - size_t maxContigId{0} ; - while(std::getline(*file, ln)){ - char fastC = ln[0] ; - if(fastC != 'S') - continue ; - - std::vector tokens ; - util::split(ln, tokens, "\t") ; + bool foundOverlapSize{false}; + // size_t overlapsize = READ_LEN-1; + size_t overlapsize = READ_LEN + 1; + consoleLog->info("Predicted overlap size: {}", overlapsize); - id = tokens[1] ; - value = tokens[2] ; - if(is_number(id)){ - size_t contigId = std::stoll(id) ; - if(contigId > maxContigId) - maxContigId = contigId ; + file.reset(new std::ifstream(gfaFileName_)) ; + size_t maxContigId{0} ; + while(std::getline(*file, ln)){ + char fastC = ln[0] ; + if(fastC != 'S') + continue ; + + std::vector tokens ; + util::split(ln, tokens, "\t") ; + + id = tokens[1] ; + value = tokens[2] ; + if(is_number(id)){ + size_t contigId = std::stoll(id) ; + if(contigId > maxContigId) + maxContigId = contigId ; + if (value != "*"){ unitigMap[contigId] = value ; - } - contigCt++ ; - } - - std::cerr << "Saw " << contigCt << " contigs in total, unitigMap.size(): " << unitigMap.size() << "\n" ; - std::cerr << "Max contig id " << maxContigId << "\n" ; - std::cerr << "Starting to load paths \n" ; + } + } + contigCt++ ; + } - file.reset(new std::ifstream(gfaFileName_)) ; + consoleLog->info("Saw {} segment lines, number of unitigs {}",contigCt, unitigMap.size()) ; + //std::cerr << "Max contig id " << 
maxContigId << "\n" ; + //std::cerr << "Starting to load paths \n" ; + + // reset the file + file.reset(new std::ifstream(gfaFileName_)) ; + // find overlap size + while(std::getline(*file, ln)){ + char fastC = ln[0] ; + if(fastC != 'P') + continue ; + std::vector tokens ; + util::split(ln, tokens, "\t") ; + if(tokens.size() != 4){ + continue ; + } + // sparse this to get the transcript name + id = getGencodeTranscript(tokens[1]) ; + if(id == ""){ + continue ; + } - // find overlap size - while(std::getline(*file, ln)){ - char fastC = ln[0] ; - if(fastC != 'P') - continue ; - std::vector tokens ; - util::split(ln, tokens, "\t") ; - if(tokens.size() != 4){ - continue ; + // A valid line + auto pvalue = tokens[2] ; + std::vector> contigVec = explode(pvalue, ',') ; + // calculate overlap size + if(!foundOverlapSize and contigVec.size() > 2){ + auto selem1 = contigVec[0]; + auto selem2 = contigVec[1]; + std::string elem1, elem2; + if (!selem1.second){ + elem1 = util::revcomp(unitigMap[selem1.first]); + }else{ + elem1 = unitigMap[selem1.first]; } - // sparse this to get the transcript name - id = getGencodeTranscript(tokens[1]) ; - if(id == ""){ - continue ; + if (!selem2.second){ + elem2 = util::revcomp(unitigMap[selem2.first]); + }else{ + elem2 = unitigMap[selem2.first]; } - - // A valid line - auto pvalue = tokens[2] ; - std::vector> contigVec = explode(pvalue, ',') ; - // calculate overlap size - if(!foundOverlapSize and contigVec.size() > 2){ - auto selem1 = contigVec[0]; - auto selem2 = contigVec[1]; - std::string elem1, elem2; - if (!selem1.second){ - elem1 = util::revcomp(unitigMap[selem1.first]); - }else{ - elem1 = unitigMap[selem1.first]; - } - if (!selem2.second){ - elem2 = util::revcomp(unitigMap[selem2.first]); - }else{ - elem2 = unitigMap[selem2.first]; + while(elem2.substr(0,overlapsize) != elem1.substr(elem1.size()-overlapsize)){ + overlapsize++; + if(overlapsize == elem1.size() || overlapsize == elem2.size()){ + consoleLog->error("GFA is 
ill-constructed") ; + //std::cout << elem1 << "\t" << elem2 << "\t" << overlapsize << "\n"; + std::exit(1); } - while(elem2.substr(0,overlapsize) != elem1.substr(elem1.size()-overlapsize)){ - overlapsize++; - if(overlapsize == elem1.size() || overlapsize == elem2.size()){ - std::cout << "GFA is ill-constructed\n" ; - std::cout << elem1 << "\t" << elem2 << "\t" << overlapsize << "\n"; - std::exit(1); - } - } - std::cout << "Overlap size " << overlapsize << "\n"; - - foundOverlapSize = true; } - if (foundOverlapSize){ - break; - } - } + consoleLog->info("Calculated overlap size {}", overlapsize); - // now update the eqclasses - file.reset(new std::ifstream(gfaFileName_)) ; - - while(std::getline(*file, ln)){ - char fastC = ln[0] ; - if(fastC != 'P') - continue ; - std::vector tokens ; - util::split(ln, tokens, "\t") ; - if(tokens.size() != 4){ - continue ; - } - // sparse this to get the transcript name - id = getGencodeTranscript(tokens[1]) ; + foundOverlapSize = true; + } + if (foundOverlapSize){ + break; + } + } - if(id == ""){ - continue ; - } + // now update the eqclasses + file.reset(new std::ifstream(gfaFileName_)) ; + consoleLog->info("Start loading paths..."); + while(std::getline(*file, ln)){ + char fastC = ln[0] ; + if(fastC != 'P') + continue ; + std::vector tokens ; + util::split(ln, tokens, "\t") ; + if(tokens.size() != 4){ + continue ; + } + // parse this to get the transcript name + id = getGencodeTranscript(tokens[1]) ; - // A valid line - auto pvalue = tokens[2] ; - std::vector> contigVec = explode(pvalue, ',') ; + if(id == ""){ + continue ; + } - updateEqClass(id, contigVec, refInfo, overlapsize) ; + // A valid line + auto pvalue = tokens[2] ; + std::vector> contigVec = explode(pvalue, ',') ; - } + updateEqClass(id, contigVec, refInfo, overlapsize) ; - - - - std::cerr << "Done with GFA \n" - << "Equivalece class size " << eqClassMap.size() - << "\ttrSegmentMap size " << trSegmentMap.size() - << "\ttranscript map size " << 
refInfo.transcriptNameMap.size() << "\n" ; + } // filter segements that is does not have // any transcript where it is within @@ -276,9 +289,9 @@ void GFAReader::parseFile( }else{ distanceFromEndMap[tid] = 0 ; } - if(tid == 11393){ - std::cout << "[DEBUG]-----" << distanceFromEndMap[tid]<<"\n"; - } + // if(tid == 11393){ + // std::cout << "[DEBUG]-----" << distanceFromEndMap[tid]<<"\n"; + // } } std::unordered_set removeKeys ; @@ -334,13 +347,22 @@ void GFAReader::parseFile( trSegmentMap[tid].push_back(id) ; } } + + consoleLog->info("Done with GFA " + "Equivalence class size {} " + "Segment map size after filtering {} " + "number of transcripts {}", + eqClassMap.size(), + trSegmentMap.size(), + refInfo.transcriptNameMap.size() + ); + if(trSegmentMap.size() < refInfo.transcriptNameMap.size()){ + consoleLog->warn("{} transcripts will not be included as " + "they don't have suitable segments", + refInfo.transcriptNameMap.size() - trSegmentMap.size() + ); + } - std::cerr << "Done Filtering \n" - << "Equivalece class size " << eqClassMap.size() - << "\ttrSegmentMap size " << trSegmentMap.size() - << "\ttranscript map size " << refInfo.transcriptNameMap.size() << "\n" ; - - //std::cerr << "After filtering eqclassSize " << eqClassMap.size() << " \n" ; } diff --git a/src/MatrixParser.cpp b/src/MatrixParser.cpp index fe99b3c..3e3ebbe 100644 --- a/src/MatrixParser.cpp +++ b/src/MatrixParser.cpp @@ -233,7 +233,7 @@ void populateGeneCountMatrix( totSum += geneCount[i][j] ; } } - std::cerr << "Total sum " << totSum << "\n" ; + std::cerr << "\n\tTotal sum " << totSum << "\n" ; } @@ -327,7 +327,7 @@ void DataMatrix::loadAlevinData( ){ // Load the values from simOpts - // Basic Options + // Basic Options auto alevinDir = simOpts.inputdir; auto sampleCells = simOpts.sampleCells; auto outDir = simOpts.outDir ; @@ -335,7 +335,7 @@ void DataMatrix::loadAlevinData( auto gfaFile = simOpts.gfaFile ; auto bfhFile = simOpts.bfhFile ; - // Advanced options + // Advanced options bool 
samplePolyA = simOpts.samplePolyA; bool dupCounts = simOpts.dupCounts ; bool generateNoisyCells = simOpts.generateNoisyCells ; @@ -346,11 +346,11 @@ void DataMatrix::loadAlevinData( // load alevin related files if(! util::fs::DirExists(alevinDir.c_str())){ - consoleLog->info("Alevin directory does not exists") ; + consoleLog->error("Alevin directory does not exists") ; std::exit(1) ; } if(!simOpts.useWhiteList && generateNoisyCells){ - consoleLog->info("--generateNoisyCells needs to be invoked in conjunction with --useWhiteList") ; + consoleLog->warn("--generateNoisyCells needs to be invoked in conjunction with --useWhiteList") ; } std::ifstream indata ; @@ -360,74 +360,76 @@ void DataMatrix::loadAlevinData( // reference. This is explicitely obtained fron alevin run if (dupCounts){ - consoleLog->info("Reading duplicated read numbers") ; - bool alevin_updated{false} ; - std::string dupCountFile_1 = alevinDir + "/MappedUmi.txt" ; - std::string dupCountFile_2 = alevinDir + "/featureDump.txt" ; + consoleLog->info("Reading duplicated read numbers") ; + bool alevin_updated{false} ; + std::string dupCountFile_1 = alevinDir + "/MappedUmi.txt" ; + std::string dupCountFile_2 = alevinDir + "/featureDump.txt" ; - std::string dupCountFile ; + std::string dupCountFile ; - if(! util::fs::FileExists(dupCountFile_2.c_str())){ - if(! util::fs::FileExists(dupCountFile_1.c_str())){ - std::cerr << "Neither MappedUmi.txt nor featureDump.txt exists run without --dupCounts\n" ; - std::exit(2) ; - }else{ - dupCountFile = dupCountFile_1 ; - } - }else{ - alevin_updated = true ; - dupCountFile = dupCountFile_2 ; - } - - if(!alevin_updated){ - std::ifstream dupCountStream(dupCountFile) ; - std::string line ; - while(std::getline(dupCountStream, line)){ - line.erase(std::remove(line.begin(), line.end(), '\n'), line.end()); - std::vector tokens ; - util::split(line, tokens, "\t") ; - if (tokens.size() == 2){ - cellNamesDupCount[tokens[0]] = std::stoul(tokens[1]) ; - } - } - }else{ + if(! 
util::fs::FileExists(dupCountFile_2.c_str())){ + if(! util::fs::FileExists(dupCountFile_1.c_str())){ + consoleLog->error("Neither MappedUmi.txt nor featureDump.txt" + "exists run without --dupCounts") ; + std::exit(2) ; + }else{ + dupCountFile = dupCountFile_1 ; + } + }else{ + alevin_updated = true ; + dupCountFile = dupCountFile_2 ; + } - std::ifstream dupCountStream(dupCountFile) ; - std::string line ; + if(!alevin_updated){ + std::ifstream dupCountStream(dupCountFile) ; + std::string line ; + while(std::getline(dupCountStream, line)){ + line.erase(std::remove(line.begin(), line.end(), '\n'), line.end()); + std::vector tokens ; + util::split(line, tokens, "\t") ; + if (tokens.size() == 2){ + cellNamesDupCount[tokens[0]] = std::stoul(tokens[1]) ; + } + } + }else{ + std::ifstream dupCountStream(dupCountFile) ; + std::string line ; - // throw away the first line - std::getline(dupCountStream, line) ; + // throw away the first line + std::getline(dupCountStream, line) ; - while(std::getline(dupCountStream, line)){ - line.erase(std::remove(line.begin(), line.end(), '\n'), line.end()); - std::vector tokens ; - util::split(line, tokens, "\t") ; - if (tokens.size() > 2){ - cellNamesDupCount[tokens[0]] = std::stoul(tokens[2]) ; - } - } - } + while(std::getline(dupCountStream, line)){ + line.erase(std::remove(line.begin(), line.end(), '\n'), line.end()); + std::vector tokens ; + util::split(line, tokens, "\t") ; + if (tokens.size() > 2){ + cellNamesDupCount[tokens[0]] = std::stoul(tokens[2]) ; + }else{ + consoleLog->error("{} is does not have enough columns", dupCountFile); + std::exit(1); + } + } + } - if(!dupCounts && (simOpts.numMolFile != "")){ - if(! 
util::fs::FileExists(simOpts.numMolFile.c_str())){ - std::cerr << simOpts.numMolFile << " does not exist\n" ; - }else{ - std::ifstream dupCountStream(simOpts.numMolFile) ; - std::string line ; - while(std::getline(dupCountStream, line)){ - line.erase(std::remove(line.begin(), line.end(), '\n'), line.end()); - std::vector tokens ; - util::split(line, tokens, "\t") ; - if (tokens.size() == 2){ - cellNamesDupCount[tokens[0]] = std::stoul(tokens[1]) ; - } - } + if(!dupCounts && (simOpts.numMolFile != "")){ + if(! util::fs::FileExists(simOpts.numMolFile.c_str())){ + std::cerr << simOpts.numMolFile << " does not exist\n" ; + }else{ + std::ifstream dupCountStream(simOpts.numMolFile) ; + std::string line ; + while(std::getline(dupCountStream, line)){ + line.erase(std::remove(line.begin(), line.end(), '\n'), line.end()); + std::vector tokens ; + util::split(line, tokens, "\t") ; + if (tokens.size() == 2){ + cellNamesDupCount[tokens[0]] = std::stoul(tokens[1]) ; + } + } - } + } + } } - - } - // End reading the duplicated counts + // End reading the duplicated counts // The map contains gid to tid map, where @@ -455,8 +457,8 @@ void DataMatrix::loadAlevinData( size_t numOfOriginalGenes{0} ; - std::cout<<"===============================================================\n\n" ; - consoleLog->info("Start parsing Alevin Directory") ; + consoleLog->info("====================Parsing Alevin Directory==========================") ; + consoleLog->info("Start parsing Alevin Directory") ; consoleLog->info("Parsing {}/quants_mat_cols.txt",alevinDir) ; { std::string geneListFile = alevinDir + "/quants_mat_cols.txt" ; @@ -486,20 +488,22 @@ void DataMatrix::loadAlevinData( } } - std::string cellColFile = simOpts.outDir + "/alevin/quants_mat_cols.txt" ; - std::ofstream cellColStream(cellColFile.c_str()) ; - for(uint32_t i= 0; i < alevinGeneIndex2NameMap.size() ; ++i){ - cellColStream << alevinGeneIndex2NameMap[i] << "\n" ; + //FIXME: One should not write the genes here. 
They can be truncated + // std::string cellColFile = simOpts.outDir + "/alevin/quants_mat_cols.txt" ; + // std::ofstream cellColStream(cellColFile.c_str()) ; + // for(uint32_t i= 0; i < alevinGeneIndex2NameMap.size() ; ++i){ + // cellColStream << alevinGeneIndex2NameMap[i] << "\n" ; + // } + if (numOfSkippedGenes > 0){ + consoleLog->warn("Original number of genes: {}\tNumber of genes skipped: {}", numOfOriginalGenes, numOfSkippedGenes) ; + consoleLog->warn("This means not all genes given in the input alevin matrix will be utilized"); } - consoleLog->info("Original number of genes: {}\tNumber of genes skipped: {}", numOfOriginalGenes, numOfSkippedGenes) ; - - consoleLog->info("Number of genes in the alevin produced files: {}",alevin2refMap.size()) ; + // consoleLog->info("Number of genes in the alevin produced files: {}",alevin2refMap.size()) ; auto& gene2transcriptMap = refInfo.gene2transcriptMap ; - // alevin2refTranscriptMap, a map from columns of the - // cell x transcrip count matrix to be fromed to the + // cell x transcript count matrix to be formed to the // transcript ids of the reference. This map is *very* // important since we need the reference id to get the // real sequences. @@ -521,11 +525,11 @@ void DataMatrix::loadAlevinData( } }else{ consoleLog->error("This should not happen: gene {} not found",gIt->first) ; + std::exit(5); } } } - consoleLog->info("Parsing {}/quants_mat_rows.txt",alevinDir) ; - + consoleLog->info("Parsing {}/quants_mat_rows.txt",alevinDir) ; // all cell names irrespective of whitelist std::map allCellListMap ; // map cell-name -> id std::vector allCellNames ; // {cell names} @@ -554,8 +558,6 @@ void DataMatrix::loadAlevinData( // Read the whitelist to know which cells are going to be finally part of the matrix. 
// If the whitelist is not present then treat the rows as whitelist - - std::string cellListFile = alevinDir + "/whitelist.txt" ; // When noisy cells are present and considered, then // allCells are the combination of whitelist cells and @@ -586,8 +588,7 @@ void DataMatrix::loadAlevinData( if((! util::fs::FileExists(cellListFile.c_str())) or !(simOpts.useWhiteList)){ consoleLog->info("whitelist.txt file does not exist/ or will NOT be used"); - consoleLog->info("we need to assume that the rows are the whitelisted barcodes") ; - + consoleLog->info("we need to assume that the rows are the whitelisted barcodes") ; // Copy the vector and map for now, and try something better // later cellNames = allCellNames ; @@ -612,20 +613,19 @@ void DataMatrix::loadAlevinData( for(size_t i = 0 ; i < cellNames.size(); ++i){ original2whitelistMap[allCellListMap[cellNames[i]]] = i ; } - - // NOTE: Noisy cells if neccessary will be appended // in the same vector of cell names. This makes // cellWhiteListMap deprecated for the case where // whitelist is provided. as original2whitelistMap and // allCellListMap together provide the same information + // TODO: remove cellWhiteListMap in future release - // NOTE: Not in use now - + + // NOTE: Not in use now consoleLog->info("Number of cells in whitelist file: {}", cellNames.size()) ; if(generateNoisyCells){ consoleLog->info("Additionally reads from noisy cells will be generated too," - "keeping track of noisy cells\n"); + "keeping track of noisy cells"); if(numOfNoisyCells == 0){ numOfNoisyCells = allCellNames.size() - cellNames.size() ; @@ -687,13 +687,13 @@ void DataMatrix::loadAlevinData( std::string countFile = alevinDir + "/quants_mat.csv" ; std::string countFileBinary = alevinDir + "/quants_mat.gz" ; - bool binary{true} ; - if(! util::fs::FileExists(countFileBinary.c_str())){ - binary = false ; - } + bool binary{true} ; + if(! 
util::fs::FileExists(countFileBinary.c_str())){ + binary = false ; + } - // TODO: Not sure if this is needed any more + // TODO: Not sure if this is needed any more std::unordered_map gene2LastTrMap ; { // Make a map from gene id to last last tid @@ -706,7 +706,7 @@ void DataMatrix::loadAlevinData( } - // NOTE: This is important for intron retention + // NOTE: This is important for intron retention // Decide whether to sample from the introns or not // depending on that use a flag for that cell and gene id // store a flag @@ -737,7 +737,7 @@ void DataMatrix::loadAlevinData( if(useDBG){ // This will read the dbg now if(!util::fs::FileExists(gfaFile.c_str())){ - std::cerr << gfaFile << "GFA file should exist EXITING !!"; + consoleLog->error("GFA file {} does not exist EXITING !!", gfaFile); std::exit(4) ; } @@ -759,21 +759,22 @@ void DataMatrix::loadAlevinData( } } }else{ - std::cerr << "RSPD file is not empty and doesn't exist, going with truncated sampling\n" ; + consoleLog->info("RSPD file is not empty and doesn't exist, going with truncated sampling") ; } } + consoleLog->info("======================= Parsing GFA file {} ==========================",gfaFile) ; - dbgPtr = new GFAReader(gfaFile) ; + dbgPtr = new GFAReader(gfaFile, consoleLog) ; dbgPtr->parseFile(refInfo) ; - std::cerr << " Read GFA file \n" << std::flush ; // NOTE: stand alone call to bfh // ignore other eqclass stuff for now - // They might come handy later. - std::string eqFileDir = "dummy/dir" ; - eqClassPtr = new BFHClass() ; + // They might come handy later. 
+ // std::string eqFileDir = "dummy/dir" ; + consoleLog->info("======================= Parsing BFH/related file ==========================") ; + eqClassPtr = new BFHClass(consoleLog) ; if((simOpts.countProbFile != "") || (simOpts.geneProbFile != "")){ if(simOpts.countProbFile != ""){ eqClassPtr->loadProbability( @@ -789,8 +790,8 @@ void DataMatrix::loadAlevinData( ) ; } }else if(bfhFile != ""){ - // NOTE: This branch is currently - // active + // NOTE: This branch is currently + // active eqClassPtr->loadBFH( bfhFile, simOpts.clusterFile, @@ -801,8 +802,15 @@ void DataMatrix::loadAlevinData( simOpts.outDir ) ; }else{ - // NOTE Or load currently existing default file + // FIXME: Or load currently existing default file, this + // shoud be changed std::string geneProbFile = "../data/hg/geneLebelProb_pbmc_4k.txt" ; + if(!util::fs::FileExists(geneProbFile.c_str())){ + consoleLog->error("alevin-mode is invoked with --dbg but neither bfh file" + " no probability files are produced" + ); + std::exit(5) ; + } eqClassPtr->loadProbability( geneProbFile, refInfo, @@ -810,29 +818,17 @@ void DataMatrix::loadAlevinData( ) ; } - // std::cerr << "Read BFH file \n" << std::flush ; - consoleLog->info("Parsed BFH file related information...") ; - consoleLog->info("Genes in BFH: {}", numOfGenes) ; - - // std::cerr << numOfGenes << "\n" << " alven2refMap.size(): " << alevin2refMap.size() << "\n" << std::flush ; - // std::cerr << " Space to be allocated \n" << std::flush ; - + consoleLog->info("Parsed BFH/Prob file related information...") ; + consoleLog->info("Genes in BFH: {}", numOfGenes) ; preCalculatedSegProb.resize(numOfGenes) ; // per gene preSegOreMapVector.resize(numOfGenes) ; geneSpecificTrVector.resize(numOfGenes) ; - // std::cerr << " Space allocated \n" << std::flush ; - // fill segment based probability // gene to tr to prob - - - //uint32_t trackGid{54819} ; + // uint32_t trackGid{54819} ; for(uint32_t i = 0; i < numOfGenes; ++i){ - - // std::cerr << i << " gene\n" << 
std::flush ; - auto it = alevin2refMap.find(i) ; if(it != alevin2refMap.end()){ auto originalGeneId = it->second ; @@ -858,11 +854,12 @@ void DataMatrix::loadAlevinData( for(auto tInfo : tcInfoVec){ if(refInfo.transcripts[tid].RefLength - tInfo.eposInContig <= MAX_FRAGLENGTH){ if(tInfo.eposInContig - tInfo.sposInContig < READ_LEN){ - consoleLog->info("seg id: {} \t {}",tInfo.eposInContig,tInfo.sposInContig) ; + consoleLog->error("encountered a contig shorter than read length", + "this is not permitted currently" + ); + consoleLog->error("seg id: {} \t {}",tInfo.eposInContig,tInfo.sposInContig) ; std::exit(6) ; } - - localGeneProb[seg] = bfhCount ; localTrVector[seg].emplace_back( tid, @@ -884,7 +881,6 @@ void DataMatrix::loadAlevinData( } } } - preCalculatedSegProb[i] = localGeneProb ; preSegOreMapVector[i] = localSegOreMap ; geneSpecificTrVector[i] = localTrVector ; @@ -896,7 +892,7 @@ void DataMatrix::loadAlevinData( } - { + { size_t nonWhiteLisBarcodesSkipped{0} ; @@ -925,20 +921,21 @@ void DataMatrix::loadAlevinData( // level counts. 
It is a multinomial distribution // to begin with where gene counts are of type double + consoleLog->info("=======================Parsing the binary matrix file======================") ; if(!binary){ pupulateGeneCountMatrix(countFile, geneCounts, numCells, numOfGenes) ; }else{ - std::cout << "\n" ; + //std::cout << "\n" ; consoleLog->info("After gathering all information about the matrix loading the binary Matrix") ; populateGeneCountMatrix(countFileBinary, originalGeneCountMatrix, numCells, numOfOriginalGenes, original2whitelistMap, original2NoisyMap, numOfOriginalCells) ; } consoleLog->info("Loaded the Matrix") ; - // Make a truncated geneCounts file //T skippedCount{0} ; if(numOfSkippedGenes > 0){ + consoleLog->info("Truncating the matrix as not all genes are included") ; for(size_t cell_id = 0 ; cell_id < originalGeneCountMatrix.size(); ++cell_id){ size_t geneCountsGeneIdx{0} ; for(size_t gene_id = 0 ; gene_id < numOfOriginalGenes ; ++ gene_id){ @@ -952,10 +949,8 @@ void DataMatrix::loadAlevinData( geneCounts = originalGeneCountMatrix ; } - consoleLog->info("Truncated the matrix ") ; - std::cout<<"===============================================================\n\n" ; - // NOTE: This feature is not tested yet + // NOTE: This feature is not tested yet // CREATE Doublets if(createDoublet){ // Treat doublets as normal cells and put them in the list of all cells @@ -998,7 +993,7 @@ void DataMatrix::loadAlevinData( size_t numOfExpressedGenesInput{0}; //size_t numOfExpressedGenesOutput{0} ; - consoleLog->info("We start to prepare Cell-Transcript matrix"); + consoleLog->info("We start to prepare Cell-Transcript matrix"); for(auto& cellGeneCounts : geneCounts){ // check if this cell Id is in cellWhiteListMap @@ -1093,48 +1088,9 @@ void DataMatrix::loadAlevinData( for(int i = 0; i < totGeneCount ; ++i){ ++cellGeneCountSampled[dg(geng)] ; } - //std::cerr << "[DEBUG] --> After multinomial sampling\n" ; - //std::cerr << "[DEBUG] --> cellGeneCountSampled size "<< 
cellGeneCountSampled.size() <<"\n" ; + trueGeneCounts[cellId].assign(cellGeneCountSampled.begin(), cellGeneCountSampled.end()) ; - //size_t numOfExpressedGenes2{0} ; - //size_t numOfExpressedGenes3{0} ; - - //// before sampling - //for(auto v : cellGeneCountsCopy){ - - // if(v > 0){ - // numOfExpressedGenes3 += 1 ; - // } - //} - - - //for(auto v : cellGeneCountSampled){ - // if(v > 0){ - // numOfExpressedGenes2 += 1 ; - // } - //} - - //double l1_diff{0.0} ; - //double l2_diff{0.0} ; - - - //for(size_t i = 0; i < cellGeneCountSampled.size() ; ++i){ - // l1_diff += std::abs( - // static_cast(cellGeneCountSampled[i]) - - // cellGeneCountsCopy[i] - // ) ; - - // l2_diff += pow(( - // static_cast(cellGeneCountSampled[i]) - - // cellGeneCountsCopy[i] - // ),2) ; - //} - - //int totGeneCountRecheck = static_cast(std::accumulate(cellGeneCountSampled.begin(), cellGeneCountSampled.end(), 0)) ; - - - //diffFileStream << l1_diff << "\t" << std::fixed << std::setprecision(5) << "\t" // << std::sqrt(l2_diff) << std::fixed << std::setprecision(5) << "\t" // << totGeneCountDouble << "\t" @@ -1233,14 +1189,11 @@ void DataMatrix::loadAlevinData( } cellSegCount[actualCellId][i] = segCountMap ; - - //cell2GeneNameMap[actualCellId].append() - } } else{ - // In alevin mode if you are not using dbg file then by default - // the weibull distribution will be invoked. + // In alevin mode if you are not using dbg file then by default + // the weibull distribution will be invoked. 
// These are the learned distribution weibull paeameters from the paper // https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1005761 // Assign probability to individual transcripts @@ -1269,16 +1222,16 @@ void DataMatrix::loadAlevinData( //NOTE: If dbg is used we don't work with transcripts trueGeneCounts[cellId][i] = geneCount ; - // NOTE: In DBG mode we should be done by this - // point + // NOTE: In DBG mode we should be done by this + // point if(useDBG){ continue ; } - // NOTE: For weibull distribution the probability - // vector for each transcript could be 0 in which - // case we drop the gene altogether + // NOTE: For weibull distribution the probability + // vector for each transcript could be 0 in which + // case we drop the gene altogether auto sum_prob = std::accumulate(probVec.begin(), probVec.end(), 0.0) ; if(sum_prob == 0){ dropThisGene = true ; @@ -1290,8 +1243,8 @@ void DataMatrix::loadAlevinData( continue ; } - // Otherwise we distribute the gene counts to - // individual transcripts + // Otherwise we distribute the gene counts to + // individual transcripts std::random_device rdt3; std::mt19937 gent3(rdt3()); std::discrete_distribution<> dmt(probVec.begin(), probVec.end()) ; @@ -1371,11 +1324,18 @@ void DataMatrix::loadAlevinData( // FIXME: this should not be ad-hoc numCells = geneCounts.size() ; - if(!useDBG){ - consoleLog->info("The transcript matrix is constructed, with dimension {} x {} \n" - "\t\t\t\t\tfrom gene count with dimention {} x {}, geneCounts.size(): {}", - data.size(), data[0].size(), cellNames.size(), alevin2refMap.size(), geneCounts.size()) ; - } + if(!useDBG){ + consoleLog->info("The transcript matrix is constructed, with dimension {} x {} \n" + "\t\t\t\t\tfrom gene count with dimention {} x {}, geneCounts.size(): {}", + data.size(), data[0].size(), cellNames.size(), alevin2refMap.size(), geneCounts.size()) ; + } + + std::string cellColFile = outDir + "/alevin/quants_mat_cols.txt" ; + std::ofstream 
cellColStream(cellColFile.c_str()) ; + + for(size_t id = 0 ; id < alevinGeneIndex2NameMap.size() ; ++id){ + cellColStream << alevinGeneIndex2NameMap[id] << "\n" ; + } } @@ -1707,8 +1667,8 @@ void DataMatrix::loadSplatterData( std::cout<<"==================Done Parsing Splatter Matrix==================\n" ; - std::string gene_name_to_track = "ENSG00000001084.13"; - size_t geneIdToTrack ; + std::string gene_name_to_track = ""; // "ENSG00000001084.13"; + size_t geneIdToTrack = std::numeric_limits::max() ; consoleLog->info( "Splatter matrix is read, with dimension {} x {}", @@ -1825,7 +1785,7 @@ void DataMatrix::loadSplatterData( } // Load transcripts that are present in the gfa file - dbgPtr = new GFAReader(gfaFile) ; + dbgPtr = new GFAReader(gfaFile, consoleLog) ; dbgPtr->parseFile(refInfo) ; diff --git a/src/Minnow.cpp b/src/Minnow.cpp index aed0cd9..857f72f 100644 --- a/src/Minnow.cpp +++ b/src/Minnow.cpp @@ -37,7 +37,25 @@ int main(int argc, char* argv[]) { } return true; }; + + auto ensure_input_dir = [](const std::string& inputdir) -> bool { + if (ghc::filesystem::is_directory(inputdir)){ + auto c = inputdir + "/quants_mat_cols.txt"; + auto r = inputdir + "/quants_mat_rows.txt"; + if(!ghc::filesystem::exists(c) || !ghc::filesystem::exists(r)){ + std::string e = "Either " + c + " or " + r + " does not exist; " \ + + "check the " + inputdir ; + throw std::runtime_error{e}; + } + }else{ + std::string e = "The input directory " + inputdir + " does not exist"; + throw std::runtime_error{e}; + } + return true ; + }; + + std::vector indexWrong; auto indexMode = ( command("index").set(selected, mode::index), (required("-r", "--ref") & values(ensure_file_exists, "ref_file", indexOpt.rfile)) % "path to the reference fasta file", @@ -45,7 +63,8 @@ int main(int argc, char* argv[]) { (option("-f", "--filt-size") & value("filt_size", indexOpt.filt_size)) % "filter size to pass to TwoPaCo when building the reference dBG", (option("--tmpdir") & value("twopaco_tmp_dir", 
indexOpt.twopaco_tmp_dir)) % "temporary work directory to pass to TwoPaCo when building the reference dBG", (option("-k", "--klen") & value("kmer_length", indexOpt.k)) % "length of the k-mer with which the dBG was built (default = 101)", - (option("-p", "--threads") & value("threads", indexOpt.p)) % "total number of threads to use for building MPHF (default = 16)" + (option("-p", "--threads") & value("threads", indexOpt.p)) % "total number of threads to use for building MPHF (default = 16)", + any_other(indexWrong) ); auto estimateMode = ( @@ -74,90 +93,70 @@ int main(int argc, char* argv[]) { ); + std::vector simulateWrong; auto simulateMode = ( command("simulate").set(selected, mode::simulate), - // required options - - (required("-i", "--inputdir") & - value("inputdir", simulateOpt.inputdir)) % - "directory with matrix file/ if this is a file instead of a dir", + // it can either be --splatter-mode or --alevin mode + // but not both + required("--alevin-mode").set(simulateOpt.alevinMode, true) | + required("--splatter-mode").set(simulateOpt.splatterMode, true) + .if_missing ( [] { std::cerr << "\n\033[31mNone of the " + << "--alevin-mode " + << "--splatter-mode " + << "are specified\033[0m \n\n"; } ) + .if_conflicted( [] { std::cerr << "\n\033[31mUse either of " + << "two modes" + << " but not both\033[0m\n\n"; } ), - (required("-o", "--outdir") & - value("mat_file", simulateOpt.outDir)) % - "the simulated reads will be written here", + // required options + (required("-i", "--inputdir") + .if_missing ([] {std::cerr << "\033[31mrequired option -i/--inputdir missing\033[0m\n\n";}) + & value(ensure_input_dir,"inputdir", simulateOpt.inputdir)) + % "directory with matrix file/ if this is a file instead of a dir", + (required("-r", "--reffile") + .if_missing ([] {std::cerr << "\033[31mrequired option -r/--reffile missing\033[0m\n\n";}) + & value(ensure_file_exists, "ref_file", simulateOpt.refFile)) % "transcriptome reference file (assumed from fasta file)", + 
(required("-w", "--whitelistFile") + .if_missing ([] {std::cerr << "\033[31mrequired option -w/--whitelistFile missing\033[0m\n\n";}) + & value(ensure_file_exists, "whitelist_file", simulateOpt.whitelistFile)) % "10X provided cell barcodes, generally named as 737K-august-2016.txt", + (required("-o", "--outdir") + .if_missing ([] {std::cerr << "\033[31m required option -o/--outdir missing \033[0m\n\n";}) + & value("output directory", simulateOpt.outDir)) % "the simulated reads will be written here", + (required("--t2g") | required("--g2t") + .if_missing ([] {std::cerr << "\033[31m required option --t2g/--g2t missing \033[0m \n\n";}) + & value(ensure_file_exists, "gene_tr", simulateOpt.gene2txpFile)) % "tab separated list of Gene to Transcript mapping", - (required("-r", "--reffile") & - value("ref_file", simulateOpt.refFile)) % - "transcriptome reference file (assumed from fasta file)", - (required("-w", "--whitelistFile") & - value("whitelist_file", simulateOpt.whitelistFile)) % - "transcriptome reference file (assumed from fasta file)", - - (option("--metadataDir") & - value("num mol file", simulateOpt.metadataDir)) % - "A directory containing metadata files in case the user defined files don't exit", - - - (option("--numMolFile") & - value("num mol file", simulateOpt.numMolFile)) % - "Number of molecules generated from each cell", - - (option("--CBLength") & - value("Cell barcode length", simulateOpt.CBLength)) % - "Cell barcode length by default is 16", + (option("--metadataDir") & value("meta data folder", simulateOpt.metadataDir)) % "A directory containing metadata files in case the user defined files don't exist", + (option("--numMolFile") & value("num mol file", simulateOpt.numMolFile)) % "Number of molecules generated from each cell", - (option("--UMILength") & - value("Cell barcode length", simulateOpt.UMILength)) % - "Cell barcode length by default is 10", - - (option("--ReadLength") & - value("Read length", simulateOpt.UMILength)) % - "read length by default is 
100", + (option("--CBLength") & value("Cell barcode length", simulateOpt.CBLength)) % "Cell barcode length by default is 16", + (option("--UMILength") & value("UMI length", simulateOpt.UMILength)) % "UMI length by default is 10", + (option("--ReadLength") & value("Read length", simulateOpt.UMILength)) % "read length by default is 100", - (option("--alevin-mode").set(simulateOpt.alevinMode, true)) % - "The program would assume that the input matrix is obtained from Alevin", + // (option("--alevin-mode").set(simulateOpt.alevinMode, true)) % + // "The program would assume that the input matrix is obtained from Alevin", - (option("--splatter-mode").set(simulateOpt.splatterMode, true)) % - "matrix file is obtained from running splatter", - - (option("--custom").set(simulateOpt.customNames, true)) % - "Read custom gene names instead of assigning genes creatively", + // (option("--splatter-mode").set(simulateOpt.splatterMode, true)) % + // "matrix file is obtained from running splatter", - - (option("--normal-mode").set(simulateOpt.normalMode, true)) % - "user provided matrix", + (option("--custom").set(simulateOpt.customNames, true)) % "Read custom gene names instead of assigning genes creatively", - (option("--testUniqness").set(simulateOpt.testUniqness, true)) % - "matrix file is obtained from running splatter", - - - (option("--reverseUniqness").set(simulateOpt.reverseUniqness, true)) % - "matrix file is obtained from running splatter", - - (option("--useWeibull").set(simulateOpt.useWeibull, true)) % - "matrix file is obtained from running splatter", + // (option("--normal-mode").set(simulateOpt.normalMode, true)) %"user provided matrix", + (option("--testUniqness").set(simulateOpt.testUniqness, true)) % "matrix file is obtained from running splatter", + (option("--reverseUniqness").set(simulateOpt.reverseUniqness, true)) % "matrix file is obtained from running splatter", + (option("--useWeibull").set(simulateOpt.useWeibull, true)) % "matrix file is 
obtained from running splatter", - (option("--numOfDoublets") & - value("number of Doublets", simulateOpt.numOfDoublets)) % - "Number of doublets to be generated", - (option("--gencode").set(simulateOpt.gencode, true)) % - "gencode reference has | separator", + (option("--numOfDoublets") & value("number of Doublets", simulateOpt.numOfDoublets)) % "Number of doublets to be generated", - (option("--g2t") & - value("gene_tr", simulateOpt.gene2txpFile)) % - "tab separated list of Gene to Transcirpt mapping", + (option("--gencode").set(simulateOpt.gencode, true)) % "gencode reference has | separator", - (option("--rspd") & - value("rspd_dist", simulateOpt.rspdFile)) % - "tab separated read start position distribution", + (option("--rspd") & value("rspd_dist", simulateOpt.rspdFile)) % "tab separated read start position distribution", - (option("--bfh") & - value("BFH file", simulateOpt.bfhFile)) % - "BFH file", + (option("--bfh") & value(ensure_file_exists, "BFH file", simulateOpt.bfhFile)) % "BFH file", (option("--geneProb") & value("gene level probability", simulateOpt.geneProbFile)) % @@ -179,13 +178,9 @@ int main(int argc, char* argv[]) { (option("--noDump").set(simulateOpt.noDump, true)) % "will use the model file made", - (option("--gfa") & - value("gfa_file", simulateOpt.gfaFile)) % - "gfa file for contigs", + (option("--gfa") & value(ensure_file_exists, "gfa_file", simulateOpt.gfaFile)) % "gfa file for contigs", - (option("--uniq") & - value("sequence uniqueness file", simulateOpt.uniquenessFile)) % - "sequence uniqueness file", + (option("--uniq") & value("sequence uniqueness file", simulateOpt.uniquenessFile)) % "sequence uniqueness file", (option("--illum") & value("illumina model", simulateOpt.illuminaModelFile)) % @@ -259,9 +254,12 @@ int main(int argc, char* argv[]) { (option("-p", "--num-threads") & value("number of threads", simulateOpt.numThreads)) % - "number of threads to parallelize the process" + "number of threads to parallelize the process", + + 
any_other(simulateWrong) ); + std::vector wrong; auto cli = ( (simulateMode | estimateMode | @@ -270,7 +268,8 @@ int main(int argc, char* argv[]) { command("-h").set(selected,mode::help) | command("help").set(selected,mode::help) ), - option("-v", "--version").call([]{std::cout << "version 0.1.0\n\n";}).doc("show version") + option("-v", "--version").call([]{std::cout << "version 0.1.0\n\n";}).doc("show version"), + any_other(wrong) ); decltype(parse(argc, argv, cli)) res; @@ -297,7 +296,7 @@ int main(int argc, char* argv[]) { if(b->arg() == "index"){ std::cout << make_man_page(indexMode, "minnow") ; }else if(b->arg() == "simulate"){ - std::cout << make_man_page(simulateMode, "minnow") ; + std::cout << usage_lines(simulateMode, "minnow") << "\n" ; }else{ std::cout << "There is no command \"" << b->arg() << "\"\n" ; std::cout << usage_lines(cli, "minnow") << '\n'; diff --git a/src/MinnowValidator.cpp b/src/MinnowValidator.cpp index 214a391..ed2f0e6 100644 --- a/src/MinnowValidator.cpp +++ b/src/MinnowValidator.cpp @@ -6,6 +6,11 @@ #include "GFAReader.hpp" #include "MinnowUtil.hpp" +#include "spdlog/spdlog.h" +#include "spdlog/sinks/ostream_sink.h" +#include "spdlog/fmt/ostr.h" +#include "spdlog/fmt/fmt.h" + #include #include #include @@ -398,10 +403,11 @@ void gfaValidate( std::string& gfaFile, std::string& fastqFile, int& edit_max_lim, - std::string& outFile + std::string& outFile, + std::shared_ptr& consoleLog ){ - GFAReader gfaObj(gfaFile) ; + GFAReader gfaObj(gfaFile, consoleLog) ; gfaObj.readUnitigs() ; @@ -487,6 +493,10 @@ void gfaValidate( } void validate(ValidateOpt& valOpts){ + // Set up logger + auto consoleSink = std::make_shared() ; + auto consoleLog = spdlog::create("minnow-Log", {consoleSink}); + if(valOpts.gfaFile == ""){ std::cerr << "\n Running ref validate \n" ; @@ -501,7 +511,8 @@ void validate(ValidateOpt& valOpts){ valOpts.gfaFile, valOpts.fastqFile, valOpts.edit_max_lim, - valOpts.outFile + valOpts.outFile, + consoleLog ) ; }