From 8e4786b411c7641555ca907844ad075571a8e2b0 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Tue, 1 Dec 2015 20:16:28 -0800 Subject: [PATCH 01/19] Alt mapping - initial sketch in single & paired aligner - WIP --- SNAPLib/BaseAligner.cpp | 69 +++++++++++++- SNAPLib/FASTA.h | 8 -- SNAPLib/Genome.h | 12 ++- SNAPLib/GenomeIndex.cpp | 33 +++++++ SNAPLib/GenomeIndex.h | 7 ++ SNAPLib/IntersectingPairedEndAligner.cpp | 112 ++++++++++++++++------- SNAPLib/IntersectingPairedEndAligner.h | 37 ++++++-- 7 files changed, 228 insertions(+), 50 deletions(-) diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp index eb9fb4cb..02efa6c0 100644 --- a/SNAPLib/BaseAligner.cpp +++ b/SNAPLib/BaseAligner.cpp @@ -44,6 +44,27 @@ using std::min; #define TRACE(...) {} #endif + +typedef struct MatchInfo +{ + GenomeLocation location; + GenomeLocation liftedLocation; + double matchProbability; + + MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p) : + location(_loc), liftedLocation(_lifted), matchProbability(_p) {} +} MatchInfo; + +bool +matchInfoComparator( + const MatchInfo& a, + const MatchInfo& b) +{ + return a.liftedLocation < b.liftedLocation; +} + +typedef VariableSizeVector MatchInfoVector; + BaseAligner::BaseAligner( GenomeIndex *i_genomeIndex, unsigned i_maxHitsToConsider, @@ -652,6 +673,37 @@ Return Value: return; } + /** + * Add up the highest-probability matches of all overlapping alternates + */ + double +computeLiftedCandidateProbability( + MatchInfoVector* allMatches, + GenomeDistance length) +{ + std::sort(allMatches->begin(), allMatches->end(), matchInfoComparator); + double totalProbability = 0.0; + MatchInfo best(0, 0, 0); + GenomeLocation farthest; + for (int i = 0; i <= allMatches->size(); i++) { + MatchInfo m(0, 0, 0); + if (i == allMatches->size() || (m = (*allMatches)[i]).liftedLocation > farthest) { + totalProbability += best.matchProbability; + best = m; + } + else { + if (m.matchProbability > best.matchProbability) { + best = m; + } + GenomeLocation e = m.liftedLocation + length - 1; + if (e > farthest) { + farthest = e; + } + } + } + return totalProbability; +} + bool BaseAligner::score( bool forceResult, @@ -744,6 +796,11 @@ Return Value: #endif unsigned weightListToCheck = highestUsedWeightList; + MatchInfoVector* allMatches = NULL; + bool anyAltMatches = FALSE; + if (genome->hasAltContigs()) { + allMatches = new MatchInfoVector(); + } do { // @@ -764,6 +821,9 @@ Return Value: primaryResult->score = bestScore; if (bestScore <= maxK) { primaryResult->location = bestScoreGenomeLocation; + if (anyAltMatches) { + probabilityOfAllCandidates = computeLiftedCandidateProbability(allMatches, read[0]->getDataLength()); + } primaryResult->mapq = computeMAPQ(probabilityOfAllCandidates, probabilityOfBestCandidate, bestScore, popularSeedsSkipped); if (primaryResult->mapq >= MAPQ_LIMIT_FOR_SINGLE_HIT) { primaryResult->status = SingleHit; @@ -913,6 +973,14 @@ Return Value: // We could mark as scored anything in between the old and new genome offsets, but it's probably not worth the effort since this is // so rare and all it would do is same time. // + + // remember in case there are alt matches + if (allMatches != NULL) { + if ((! anyAltMatches) && genome->getContigAtLocation(genomeLocation)->isAlternate) { + anyAltMatches = TRUE; + } + allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability)); + } } } } else { // if we had genome data to compare against @@ -1114,7 +1182,6 @@ Return Value: return false; } - void BaseAligner::prefetchHashTableBucket(GenomeLocation genomeLocation, Direction direction) { diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h index 6e542f32..44cb810f 100644 --- a/SNAPLib/FASTA.h +++ b/SNAPLib/FASTA.h @@ -39,11 +39,3 @@ ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool AppendFASTAGenome(const Genome *, FILE *fasta); - -// -// This is arbitrary; is there some existing convention? -// -inline const char *diploidFASTASexPrefix(bool male) -{ - return male ? "PATERNAL|" : "MATERNAL|"; -} diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index 84e94f52..d65d9008 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -245,9 +245,15 @@ class Genome { } struct Contig { - Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL) {} + Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL), + isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0) {} GenomeLocation beginningLocation; GenomeDistance length; + bool isAlternate; + int altGroup; // each group of overlapping alt regions is given a unique ID + bool isReverseStrand; // if reversed alternate strand + GenomeLocation liftedLocation; // location of beginning of alt contig mapping to primary + GenomeLocation contextBefore, contextAfter; // context sequence added from primary (alts near ends have less context) unsigned nameLength; char *name; }; @@ -261,6 +267,10 @@ class Genome { const Contig *getNextContigAfterLocation(GenomeLocation location) const; int getContigNumAtLocation(GenomeLocation location) const; // Returns the contig number, which runs from 0 .. getNumContigs() - 1. + inline bool hasAltContigs() const { return FALSE; } // todo: implement + + GenomeLocation getLiftedLocation(GenomeLocation altLocation) const { return altLocation; } // todo: implement + // unused Genome *copy() const {return copy(true,true,true);} // unused Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);} diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 2f34ba82..72f05bd6 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -1915,6 +1915,22 @@ GenomeIndex::lookupSeed32( } } + + void +GenomeIndex::lookupSeedAlt32( + Seed seed, + _int64 *nHits, + const unsigned **hits, + _int64 *nRCHits, + const unsigned **rcHits, + const unsigned **unliftedHits, + const unsigned **unliftedRCHits) +{ + lookupSeed32(seed, nHits, hits, nRCHits, rcHits); + *unliftedHits = *hits; + *unliftedRCHits = *rcHits; +} + void GenomeIndex::fillInLookedUpResults32( const unsigned *subEntry, @@ -2041,6 +2057,23 @@ GenomeIndex::lookupSeed( } } + void +GenomeIndex::lookupSeedAlt( + Seed seed, + _int64 * nHits, + const GenomeLocation ** hits, + _int64 * nRCHits, + const GenomeLocation ** rcHits, + const GenomeLocation ** unliftedHits, + const GenomeLocation ** unliftedRCHits, + GenomeLocation * singleHit, + GenomeLocation * singleRCHit) +{ + // todo: implement + lookupSeed(seed, nHits, hits, nRCHits, rcHits, singleHit, singleRCHit); + *unliftedHits = *hits; + *unliftedRCHits = *rcHits; +} void GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index 21654618..2cdd0782 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -50,6 +50,13 @@ class GenomeIndex { void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit); void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits); + // versions for genome that has alt regions + // hits/rcHits locations are lifted to non-alt contigs, unliftedHits/unliftedRCHits are original locations in alt contigs + // nHits/nRCHits is the same for both sets + // *hits==*unliftedHits && *rcHits==*unliftedRCHits iff seed has no alt hits + void lookupSeedAlt(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, const GenomeLocation **unliftedHits, const GenomeLocation **unliftedRCHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit); + void lookupSeedAlt32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits, const unsigned **unliftedHits, const unsigned **unliftedRCHits); + bool doesGenomeIndexHave64BitLocations() const {return locationSize > 4;} // diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index be94b4ec..a0d1c759 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -335,12 +335,26 @@ IntersectingPairedEndAligner::align( _int64 nHits[NUM_DIRECTIONS]; const GenomeLocation *hits[NUM_DIRECTIONS]; const unsigned *hits32[NUM_DIRECTIONS]; + const GenomeLocation *unliftedHits[NUM_DIRECTIONS]; + const unsigned *unliftedHits32[NUM_DIRECTIONS]; - if (doesGenomeIndexHave64BitLocations) { - index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], - hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); - } else { - index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]); + if (!doesGenomeIndexHaveAlts) { + if (doesGenomeIndexHave64BitLocations) { + index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], + hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); + } + else { + index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]); + } + } + else { + if (doesGenomeIndexHave64BitLocations) { + index->lookupSeedAlt(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], &unliftedHits[FORWARD], &unliftedHits[RC], + hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); + } + else { + index->lookupSeedAlt32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC], &unliftedHits32[FORWARD], &unliftedHits32[RC]); + } } countOfHashTableLookups[whichRead]++; @@ -353,10 +367,21 @@ IntersectingPairedEndAligner::align( } if (nHits[dir] < maxBigHits) { totalHashTableHits[whichRead][dir] += nHits[dir]; - if (doesGenomeIndexHave64BitLocations) { - hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], beginsDisjointHitSet[dir]); - } else { - hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], beginsDisjointHitSet[dir]); + if (!doesGenomeIndexHaveAlts) { + if (doesGenomeIndexHave64BitLocations) { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], NULL, beginsDisjointHitSet[dir]); + } + else { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], NULL, beginsDisjointHitSet[dir]); + } + } + else { + if (doesGenomeIndexHave64BitLocations) { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], unliftedHits[dir], beginsDisjointHitSet[dir]); + } + else { + hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], unliftedHits32[dir], beginsDisjointHitSet[dir]); + } } beginsDisjointHitSet[dir]= false; } else { @@ -388,7 +413,6 @@ IntersectingPairedEndAligner::align( Direction setPairDirection[NUM_SET_PAIRS][NUM_READS_PER_PAIR] = {{FORWARD, RC}, {RC, FORWARD}}; - // // Phase 2: find all possible candidates and add them to candidate lists (for the reads with fewer and more hits). // @@ -409,6 +433,10 @@ IntersectingPairedEndAligner::align( unsigned lastSeedOffsetForReadWithFewerHits; GenomeLocation lastGenomeLocationForReadWithFewerHits; GenomeLocation lastGenomeLocationForReadWithMoreHits; + GenomeLocation lastUnliftedGenomeLocationForReadWithFewerHits; + GenomeLocation lastUnliftedGenomeLocationForReadWithMoreHits; + GenomeLocation *pLastGenomeLocationForReadWithFewerHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithFewerHits : NULL; + GenomeLocation *pLastGenomeLocationForReadWithMoreHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithMoreHits : NULL; unsigned lastSeedOffsetForReadWithMoreHits; bool outOfMoreHitsLocations = false; @@ -416,7 +444,7 @@ IntersectingPairedEndAligner::align( // // Seed the intersection state by doing a first lookup. // - if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) { + if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) { // // No hits in this direction. // @@ -444,7 +472,7 @@ IntersectingPairedEndAligner::align( // location that's not too high. // if (!setPair[readWithMoreHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithFewerHits + maxSpacing, - &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) { + &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) { break; // End of all of the mates. We're done with this set pair. } } @@ -463,7 +491,7 @@ IntersectingPairedEndAligner::align( } if (!setPair[readWithFewerHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithMoreHits + maxSpacing, &lastGenomeLocationForReadWithFewerHits, - &lastSeedOffsetForReadWithFewerHits)) { + &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) { // // No more candidates on the read with fewer hits side. We're done with this set pair. // @@ -490,7 +518,8 @@ IntersectingPairedEndAligner::align( soft_exit(1); } scoringMateCandidates[whichSetPair][lowestFreeScoringMateCandidate[whichSetPair]].init( - lastGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits); + lastGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits, + doesGenomeIndexHaveAlts ? lastUnliftedGenomeLocationForReadWithMoreHits : lastGenomeLocationForReadWithMoreHits); #ifdef _DEBUG if (_DumpAlignments) { @@ -505,7 +534,7 @@ IntersectingPairedEndAligner::align( previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits; - if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) { + if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) { lastGenomeLocationForReadWithMoreHits = 0; outOfMoreHitsLocations = true; break; // out of the loop looking for candidates on the more hits side. @@ -550,7 +579,8 @@ IntersectingPairedEndAligner::align( scoringCandidatePool[lowestFreeScoringCandidatePoolEntry].init(lastGenomeLocationForReadWithFewerHits, whichSetPair, lowestFreeScoringMateCandidate[whichSetPair] - 1, lastSeedOffsetForReadWithFewerHits, bestPossibleScoreForReadWithFewerHits, - scoringCandidates[bestPossibleScore]); + scoringCandidates[bestPossibleScore], + doesGenomeIndexHaveAlts ? lastUnliftedGenomeLocationForReadWithFewerHits : lastGenomeLocationForReadWithFewerHits); scoringCandidates[bestPossibleScore] = &scoringCandidatePool[lowestFreeScoringCandidatePoolEntry]; @@ -568,7 +598,7 @@ IntersectingPairedEndAligner::align( maxUsedBestPossibleScoreList = max(maxUsedBestPossibleScoreList, bestPossibleScore); } - if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) { + if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) { break; } } @@ -602,7 +632,7 @@ IntersectingPairedEndAligner::align( double fewerEndMatchProbability; int fewerEndGenomeLocationOffset; - scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsGenomeLocation, + scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsUnliftedGenomeLocation, candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset); _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore); @@ -635,7 +665,7 @@ IntersectingPairedEndAligner::align( // use now, score it. // if (mate->score == -2 || mate->score == -1 && mate->scoreLimit < scoreLimit - fewerEndScore) { - scoreLocation(readWithMoreHits, setPairDirection[candidate->whichSetPair][readWithMoreHits], mate->readWithMoreHitsGenomeLocation, + scoreLocation(readWithMoreHits, setPairDirection[candidate->whichSetPair][readWithMoreHits], mate->readWithMoreHitsUnliftedGenomeLocation, mate->seedOffset, scoreLimit - fewerEndScore, &mate->score, &mate->matchProbability, &mate->genomeOffset); #ifdef _DEBUG @@ -654,6 +684,16 @@ IntersectingPairedEndAligner::align( if (mate->score != -1) { double pairProbability = mate->matchProbability * fewerEndMatchProbability; unsigned pairScore = mate->score + fewerEndScore; + + // reduce probability of pairs matching across different overlapping alts + // todo: assuming if they're on different alts within maxSpacing they overlap - true for GRCh38 but not necessarily for all genomes + // use crossover probability with 1 centiMorgan ~= 1Mbp + if (doesGenomeIndexHaveAlts && isBothAltPairMapping(candidate, mate) && + abs(mate->readWithMoreHitsUnliftedGenomeLocation - candidate->readWithFewerHitsUnliftedGenomeLocation) > 2*maxSpacing) + { + pairProbability *= 1e-8 * abs(candidate->readWithFewerHitsGenomeLocation - mate->readWithMoreHitsGenomeLocation); + } + // // See if this should be ignored as a merge, or if we need to back out a previously scored location // because it's a worse version of this location. @@ -713,7 +753,7 @@ IntersectingPairedEndAligner::align( candidate->mergeAnchor = mergeAnchor; } else { merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, - pairProbability, pairScore, &oldPairProbability); + pairProbability, pairScore, doesGenomeIndexHaveAlts && isNonAltPairMapping(candidate, mate), &oldPairProbability); } if (!merged) { @@ -727,7 +767,8 @@ IntersectingPairedEndAligner::align( bool isBestHit = false; if (pairScore <= maxK && (pairScore < bestPairScore || - (pairScore == bestPairScore && pairProbability > probabilityOfBestPair))) { + (pairScore == bestPairScore && (pairProbability > probabilityOfBestPair || + (pairProbability == probabilityOfBestPair && isNonAltPairMapping(candidate, mate)))))) { // // A new best hit. // @@ -759,8 +800,8 @@ IntersectingPairedEndAligner::align( } bestPairScore = pairScore; probabilityOfBestPair = pairProbability; - bestResultGenomeLocation[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset; - bestResultGenomeLocation[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset; + bestResultGenomeLocation[readWithFewerHits] = candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset; + bestResultGenomeLocation[readWithMoreHits] = mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset; bestResultScore[readWithFewerHits] = fewerEndScore; bestResultScore[readWithMoreHits] = mate->score; bestResultDirection[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits]; @@ -786,8 +827,8 @@ IntersectingPairedEndAligner::align( result->direction[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits]; result->direction[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits]; result->fromAlignTogether = true; - result->location[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset; - result->location[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset; + result->location[readWithMoreHits] = mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset; + result->location[readWithFewerHits] = candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset; result->mapq[0] = result->mapq[1] = 0; result->score[readWithMoreHits] = mate->score; result->score[readWithFewerHits] = fewerEndScore; @@ -1091,7 +1132,7 @@ IntersectingPairedEndAligner::HashTableHitSet::init() #define RL(lookups, glType, lookupListHead) \ void \ -IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _int64 nHits, const glType *hits, bool beginsDisjointHitSet) \ +IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _int64 nHits, const glType *hits, const glType *unliftedHits, bool beginsDisjointHitSet) \ { \ _ASSERT(nLookupsUsed < maxSeeds); \ if (beginsDisjointHitSet) { \ @@ -1106,6 +1147,7 @@ IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _ASSERT(currentDisjointHitSet != -1); /* Essentially that beginsDisjointHitSet is set for the first recordLookup call */ \ lookups[nLookupsUsed].currentHitForIntersection = 0; \ lookups[nLookupsUsed].hits = hits; \ + lookups[nLookupsUsed].unliftedHits = unliftedHits; \ lookups[nLookupsUsed].nHits = nHits; \ lookups[nLookupsUsed].seedOffset = seedOffset; \ lookups[nLookupsUsed].whichDisjointHitSet = currentDisjointHitSet; \ @@ -1181,7 +1223,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren } bool -IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound) + IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound, GenomeLocation *actualUnliftedGenomeLocationFound) { bool anyFound = false; @@ -1238,6 +1280,10 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(Genom if (probeHit - seedOffset > bestLocationFound) { anyFound = true; mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; + if (actualUnliftedGenomeLocationFound != NULL) { + *actualUnliftedGenomeLocationFound = doesGenomeIndexHave64BitLocations + ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]; + } *seedOffsetFound = seedOffset; } @@ -1273,7 +1319,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(Genom bool -IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound) + IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation) { bool anyFound = false; *genomeLocation = 0; @@ -1286,6 +1332,9 @@ IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genom for (unsigned i = 0; i < nLookupsUsed; i++) { \ if (lookups[i].nHits > 0 && lookups[i].hits[0] - lookups[i].seedOffset > GenomeLocationAsInt64(*genomeLocation)) { \ mostRecentLocationReturned = *genomeLocation = lookups[i].hits[0] - lookups[i].seedOffset; \ + if (unliftedGenomeLocation != NULL) { \ + *unliftedGenomeLocation = lookups[i].unliftedHits[0] - lookups[i].seedOffset; \ + } \ *seedOffsetFound = lookups[i].seedOffset; \ anyFound = true; \ } \ @@ -1303,7 +1352,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genom } bool -IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound) + IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation) { // // Look through all of the lookups and find the one with the highest location smaller than the current one. @@ -1373,7 +1422,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *g } bool -IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, +IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, bool newPairIsNonAlt, double *oldMatchProbability) { if (locationForReadWithMoreHits == InvalidGenomeLocation || !doesRangeMatch(newMoreHitLocation, newFewerHitLocation)) { @@ -1390,7 +1439,8 @@ IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitL // // Within merge distance. Keep the better score (or if they're tied the better match probability). // - if (newPairScore < pairScore || newPairScore == pairScore && newMatchProbability > matchProbability) { + if (newPairScore < pairScore || (newPairScore == pairScore && + (newMatchProbability > matchProbability || (newMatchProbability == matchProbability && newPairIsNonAlt)))) { #ifdef _DEBUG if (_DumpAlignments) { printf("Merge replacement at anchor (%u, %u), loc (%u, %u), old match prob %e, new match prob %e, old pair score %d, new pair score %d\n", diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h index 9bc6029e..e951f64b 100644 --- a/SNAPLib/IntersectingPairedEndAligner.h +++ b/SNAPLib/IntersectingPairedEndAligner.h @@ -134,6 +134,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner unsigned maxSpacing; unsigned seedLen; bool doesGenomeIndexHave64BitLocations; + bool doesGenomeIndexHaveAlts; _int64 nLocationsScored; bool noUkkonen; bool noOrderedEvaluation; @@ -149,6 +150,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner unsigned seedOffset; _int64 nHits; const GL * hits; + const GL * unliftedHits; unsigned whichDisjointHitSet; // @@ -185,7 +187,8 @@ class IntersectingPairedEndAligner : public PairedEndAligner // provide the lookup function a place to write the result. Since we need one per // lookup, it goes here. // - GL singletonGenomeLocation[2]; // The [2] is because we need to look one before sometimes, and that allows space + GL singletonGenomeLocation[4]; // The [4] is because we need to look one before sometimes, and that allows space + // also to allow space for unlifted locations }; // @@ -211,26 +214,26 @@ class IntersectingPairedEndAligner : public PairedEndAligner // seed for it not to hit, and since the reads are disjoint there can't be a case // where the same difference caused two seeds to miss). // - void recordLookup(unsigned seedOffset, _int64 nHits, const unsigned *hits, bool beginsDisjointHitSet); - void recordLookup(unsigned seedOffset, _int64 nHits, const GenomeLocation *hits, bool beginsDisjointHitSet); + void recordLookup(unsigned seedOffset, _int64 nHits, const unsigned *hits, const unsigned *unliftedHits, bool beginsDisjointHitSet); + void recordLookup(unsigned seedOffset, _int64 nHits, const GenomeLocation *hits, const GenomeLocation *unliftedHits, bool beginsDisjointHitSet); // // This efficiently works through the set looking for the next hit at or below this address. // A HashTableHitSet only allows a single iteration through its address space per call to // init(). // - bool getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound); + bool getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound, GenomeLocation *actualUnliftedGenomeLocationFound); // // Walk down just one step, don't binary search. // - bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound); + bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation); // // Find the highest genome address. // - bool getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound); + bool getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation); unsigned computeBestPossibleScoreForCurrentHit(); @@ -377,7 +380,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner // // Returns true and sets oldMatchProbability if this should be eliminated due to a match. // - bool checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, + bool checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, bool newPairIsNonAlt, double *oldMatchProbability); }; @@ -394,14 +397,16 @@ class IntersectingPairedEndAligner : public PairedEndAligner // double matchProbability; GenomeLocation readWithMoreHitsGenomeLocation; + GenomeLocation readWithMoreHitsUnliftedGenomeLocation; unsigned bestPossibleScore; unsigned score; unsigned scoreLimit; // The scoreLimit with which score was computed unsigned seedOffset; int genomeOffset; - void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_) { + void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_, GenomeLocation readWithMoreHitsUnliftedGenomeLocation_) { readWithMoreHitsGenomeLocation = readWithMoreHitsGenomeLocation_; + readWithMoreHitsUnliftedGenomeLocation = readWithMoreHitsUnliftedGenomeLocation_; bestPossibleScore = bestPossibleScore_; seedOffset = seedOffset_; score = -2; @@ -416,15 +421,17 @@ class IntersectingPairedEndAligner : public PairedEndAligner MergeAnchor * mergeAnchor; unsigned scoringMateCandidateIndex; // Index into the array of scoring mate candidates where we should look GenomeLocation readWithFewerHitsGenomeLocation; + GenomeLocation readWithFewerHitsUnliftedGenomeLocation; unsigned whichSetPair; unsigned seedOffset; unsigned bestPossibleScore; void init(GenomeLocation readWithFewerHitsGenomeLocation_, unsigned whichSetPair_, unsigned scoringMateCandidateIndex_, unsigned seedOffset_, - unsigned bestPossibleScore_, ScoringCandidate *scoreListNext_) + unsigned bestPossibleScore_, ScoringCandidate *scoreListNext_, GenomeLocation readWithFewerHitsUnliftedGenomeLocation_) { readWithFewerHitsGenomeLocation = readWithFewerHitsGenomeLocation_; + readWithFewerHitsUnliftedGenomeLocation = readWithFewerHitsUnliftedGenomeLocation_; whichSetPair = whichSetPair_; _ASSERT(whichSetPair < NUM_SET_PAIRS); // You wouldn't think this would be necessary, but... scoringMateCandidateIndex = scoringMateCandidateIndex_; @@ -435,6 +442,18 @@ class IntersectingPairedEndAligner : public PairedEndAligner } }; + static bool isNonAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate) + { + return candidate->readWithFewerHitsGenomeLocation == candidate->readWithFewerHitsUnliftedGenomeLocation && + mate->readWithMoreHitsGenomeLocation == mate->readWithMoreHitsUnliftedGenomeLocation; + } + + static bool isBothAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate) + { + return candidate->readWithFewerHitsGenomeLocation != candidate->readWithFewerHitsUnliftedGenomeLocation && + mate->readWithMoreHitsGenomeLocation != mate->readWithMoreHitsUnliftedGenomeLocation; + } + // // A pool of scoring candidates. For each alignment call, we free them all by resetting lowestFreeScoringCandidatePoolEntry to 0, // and then fill in the content when they're initialized. This means that for alignments with few candidates we'll be using the same From 6fbf18a49e82fbea4d6d8cc46f42fbdd79095b32 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Sun, 6 Dec 2015 21:25:09 -0800 Subject: [PATCH 02/19] Read genome with alts --- SNAPLib/FASTA.cpp | 37 +-- SNAPLib/FASTA.h | 2 +- SNAPLib/Genome.cpp | 307 ++++++++++++++++++++++- SNAPLib/Genome.h | 51 +++- SNAPLib/GenomeIndex.cpp | 137 +++++++++- SNAPLib/GenomeIndex.h | 15 +- SNAPLib/IntersectingPairedEndAligner.cpp | 30 +-- SNAPLib/IntersectingPairedEndAligner.h | 14 +- 8 files changed, 514 insertions(+), 79 deletions(-) diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp index 25418eef..75d92a08 100644 --- a/SNAPLib/FASTA.cpp +++ b/SNAPLib/FASTA.cpp @@ -36,7 +36,8 @@ ReadFASTAGenome( const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, - unsigned chromosomePaddingSize) + unsigned chromosomePaddingSize, + AltContigMap* altMap) { // // We need to know a bound on the size of the genome before we create the Genome object. @@ -96,33 +97,12 @@ ReadFASTAGenome( // // Now supply the chromosome name. // - if (NULL != pieceNameTerminatorCharacters) { - for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { - char *terminator = strchr(lineBuffer+1, pieceNameTerminatorCharacters[i]); - if (NULL != terminator) { - *terminator = '\0'; - } - } - } - if (spaceIsAPieceNameTerminator) { - char *terminator = strchr(lineBuffer, ' '); - if (NULL != terminator) { - *terminator = '\0'; - } - terminator = strchr(lineBuffer, '\t'); - if (NULL != terminator) { - *terminator = '\0'; - } - } - char *terminator = strchr(lineBuffer, '\n'); - if (NULL != terminator) { - *terminator = '\0'; + char * terminator = Genome::findTerminator(lineBuffer, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator); + if (altMap != NULL) { + altMap->addFastaContig(lineBuffer, terminator); } - terminator = strchr(lineBuffer, '\r'); - if (NULL != terminator) { - *terminator = '\0'; - } - genome->startContig(lineBuffer+1); + *terminator = 0; + genome->startContig(lineBuffer+1, altMap); } else { if (!inAContig) { WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n"); @@ -170,6 +150,9 @@ ReadFASTAGenome( // genome->addData(paddingBuffer); genome->fillInContigLengths(); + if (altMap != NULL) { + genome->adjustAltContigs(altMap); + } genome->sortContigsByName(); fclose(fastaFile); diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h index 44cb810f..cda9a1f8 100644 --- a/SNAPLib/FASTA.h +++ b/SNAPLib/FASTA.h @@ -27,7 +27,7 @@ Revision History: #include "Genome.h" const Genome * -ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize); +ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, AltContigMap* altMap); // // The FASTA appending functions return whether the write was successful. diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 4a0629ec..07460392 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -31,6 +31,7 @@ Revision History: #include "exit.h" #include "Error.h" #include "Util.h" +#include "VariableSizeVector.h" Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs) : maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs), @@ -73,7 +74,7 @@ Genome::addData(const char *data) } void -Genome::startContig(const char *contigName) +Genome::startContig(const char *contigName, AltContigMap *altMap) { if (nContigs == maxContigs) { // @@ -102,6 +103,10 @@ Genome::startContig(const char *contigName) strncpy(contigs[nContigs].name,contigName,len); contigs[nContigs].name[len-1] = '\0'; + if (altMap != NULL) { + altMap->setAltContig(&contigs[nContigs]); + } + nContigs++; } @@ -465,6 +470,78 @@ void Genome::fillInContigLengths() contigs[nContigs-1].length = nBases - GenomeLocationAsInt64(contigs[nContigs-1].beginningLocation); } +void Genome::adjustAltContigs(AltContigMap* altMap) +{ + if (altMap == NULL) { + return; + } + bool error = false; + // build parent links from alt contigs + for (int i = 0; i < nContigs; i++) { + if (contigs[i].isAlternate) { + const char* parentName = altMap->getParentContigName(contigs[i].name); + if (parentName == NULL) { + WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name); + error = true; + continue; + } + GenomeLocation parentLocation; + int parentIndex; + if (!getLocationOfContig(parentName, &parentLocation, &parentIndex)) { + WriteErrorMessage("Unable to find parent contig %s for alt contig %s\n", parentName, contigs[i].name); + error = true; + continue; + } + if (contigs[parentIndex].isAlternate) { + WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName); + error = true; continue; + } + contigs[i].liftedLocation = parentLocation; + } + } + if (error) { + soft_exit(1); + } + + // flip RC contigs + for (int i = 0; i < nContigs; i++) { + if (contigs[i].isAlternate && contigs[i].isReverseStrand) { + util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length); + } + } +} + +char * Genome::findTerminator(char* lineBuffer, const char* pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator) +{ + char* result = lineBuffer + strlen(lineBuffer); + if (NULL != pieceNameTerminatorCharacters) { + for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { + char *terminator = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]); + if (NULL != terminator && terminator < result) { + result = terminator; + } + } + } + if (spaceIsAPieceNameTerminator) { + char *terminator = strchr(lineBuffer, ' '); + if (NULL != terminator && terminator < result) { + result = terminator; + } + terminator = strchr(lineBuffer, '\t'); + if (NULL != terminator && terminator < result) { + result = terminator; + } + } + char *terminator = strchr(lineBuffer, '\n'); + if (NULL != terminator) { + result = terminator; + } + terminator = strchr(lineBuffer, '\r'); + if (NULL != terminator) { + result = terminator; + } + return result; +} const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const { const Contig *contig = getContigAtLocation(location); @@ -491,4 +568,230 @@ const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned return contig; } -GenomeLocation InvalidGenomeLocation; // Gets set on genome build/load \ No newline at end of file +GenomeLocation InvalidGenomeLocation; // Gets set on genome build/load + +// terminate string at next tab or newline +// return pointer to beginning of next chunk of data +char* tokenizeToNextTabOrNewline(char* start, bool* endOfLine, bool* endOfFile) +{ + char* p = start; + while (*p) { + if (*p == '\t') { + *p = '\0'; + *endOfLine = false; + *endOfFile = false; + return p + 1; + } else if (*p == '\r' || *p == '\n') { + if (*(p + 1) != *p && (*(p + 1) == '\r' || *(p + 1) == '\n')) { + *p++ = '\0'; + } else { + } + *p = '\0'; + *endOfLine = true; + *endOfFile = false; + return p + 1; + } + p++; + } + *endOfLine = true; + *endOfFile = true; + return p; +} + +static const int ALT_SCAF_ACC = 0; +static const int PARENT_ACC = 1; +static const int ORI = 2; +static const int ALT_SCAF_START = 3; +static const int ALT_SCAF_STOP = 4; +static const int PARENT_START = 5; +static const int PARENT_STOP = 6; +static const int ALT_START_TAIL = 7; +static const int ALT_STOP_TAIL = 8; +static const int N_COLUMNS = 9; + +AltContigMap* AltContigMap::readFromFile(const char* filename, const char* columns) +{ + // just map & copy the whole file into a region of memory + FileMapper map; + if (!map.init(filename)) { +err_map_failed: + WriteErrorMessage("Failed to map file %s\n", filename); + soft_exit(1); + } + _int64 size = map.getFileSize(); + char* buffer = (char*)BigAlloc(size + 1 + strlen(columns) + 1); + void* token; + char* mapped = map.createMapping(0, size, &token); + if (mapped == NULL) { + goto err_map_failed; + } + memcpy(buffer, mapped, size); + buffer[size] = '\0'; + if (strlen(buffer) != size) { + WriteErrorMessage("Nulls in file %s\n", filename); + soft_exit(1); + } + map.unmap(token); + strcpy(buffer + size + 1, columns); + + AltContigMap* result = new AltContigMap(); + // first find accession FASTA tag, add "|" + char* p = buffer + size + 1; + if (*p == '#') { + p++; + } + char* q = strchr(p, ','); + if (q == NULL) { +err_invalid_column_spec: + WriteErrorMessage("Invalid columns spec %s\n", columns); + soft_exit(1); + } + *q = '\0'; + result->accessionFastaTag = p; + + // get names for each column type (last 2 are optional) + p = q + 1; + char* columnNames[N_COLUMNS]; + memset(columnNames, 0, sizeof(char*) * N_COLUMNS); + for (int i = 0; i < N_COLUMNS; i++) { + columnNames[i] = p; + q = strchr(p, ','); + if (q != NULL) { + *q = '\0'; + p = q + 1; + } else { + q = p + strlen(p); + if (i < PARENT_STOP) { + goto err_invalid_column_spec; + } + break; + } + } + + // map column names to indices + VariableSizeVector columnTypes; + p = buffer + (*buffer == '#'); // beginning of buffer, skipping possible comment char + bool endOfLine = false, endOfFile = false; + bool columnFound[N_COLUMNS]; + memset(columnFound, 0, sizeof(bool)* N_COLUMNS); + for (int columnIndex = 0; !endOfLine; columnIndex++) { + q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile); + if (q == NULL) { +err_file_format: + WriteErrorMessage("Invalid file format for alt data in %s\n", filename); + soft_exit(1); + } + *q = '\0'; + for (int i = 0; i <= N_COLUMNS; i++) { + if (i < N_COLUMNS && !strcmp(columnNames[i], p)) { + columnTypes.add(i); + columnFound[i] = true; + break; + } else if (i == N_COLUMNS) { + columnTypes.add(N_COLUMNS); // ignore this column + } + } + p = q; + } + for (int i = 0; i < N_COLUMNS; i++) { + if (columnNames[i] != NULL && !columnFound[i]) { + goto err_file_format; + } + } + while (!endOfFile) { + endOfLine = false; + AltContig alt; + for (int columnIndex = 0; !endOfLine; columnIndex++) { + q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile); + switch (columnTypes[columnIndex]) { + case ALT_SCAF_ACC: + alt.accession = p; + break; + case PARENT_ACC: + alt.parentAccession = p; + break; + case ORI: + alt.isRC = *p == '-'; + break; + case ALT_SCAF_START: + alt.start = atol(p); + break; + case ALT_SCAF_STOP: + alt.stop = atol(p); + break; + case PARENT_START: + alt.parentStart = atol(p); + break; + case PARENT_STOP: + alt.parentStop = atol(p); + break; + case ALT_START_TAIL: + alt.startTail = atol(p); + break; + case ALT_STOP_TAIL: + alt.stopTail = atol(p); + break; + case N_COLUMNS: + // ignore + break; + default: + _ASSERT(false); + } + p = q; + } + result->altsByAccession[alt.accession] = alt; + } + return result; +} + +void AltContigMap::addFastaContig(const char* lineBuffer, const char* nameTerminator) +{ + // get the name + char* name = (char*) malloc(nameTerminator - lineBuffer); + memcpy(name, lineBuffer + 1, nameTerminator - lineBuffer - 1); + name[nameTerminator - lineBuffer - 1] = 0; + + // find the accession number + const char* tag = strstr(lineBuffer, accessionFastaTag); + const char* p = tag + strlen(accessionFastaTag); + if (tag == NULL || *p == '\0') { + WriteErrorMessage("Unable to find accession code for contig %s in FASTA line\n%s\n", name, lineBuffer); + soft_exit(1); + } + const char*q = p; + while (*q != '\0' && *q != '|' && *q != ' ' && *q != '\t' && *q != '\r' && *q != '\n') { + q++; + } + char* accession = (char*)malloc(q - p); + memcpy(accession, p, q - p - 1); + *(accession + (q - p)) = '\0'; + + nameToAccession[name] = accession; +} + +void AltContigMap::setAltContig(Genome::Contig* contig) +{ + StringMap::iterator accession = nameToAccession.find(contig->name); + if (accession != nameToAccession.end()) { + StringAltContigMap::iterator alt = altsByAccession.find(accession->second); + if (alt != altsByAccession.end()) { + contig->isAlternate = true; + contig->isReverseStrand = alt->second.isRC; + return; + } + } + contig->isAlternate = false; + contig->isReverseStrand = false; +} + +const char* AltContigMap::getParentContigName(const char* altName) +{ + StringMap::iterator accession = nameToAccession.find(altName); + if (accession != nameToAccession.end()) { + StringAltContigMap::iterator alt = altsByAccession.find(accession->second); + if (alt != altsByAccession.end()) { + return alt->second.name; + } + } + return NULL; +} diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index d65d9008..f2ee8a67 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -26,6 +26,8 @@ Revision History: #include "Compat.h" #include "GenericFile.h" #include "GenericFile_map.h" +#include +#include // // We have two different classes to represent a place in a genome and a distance between places in a genome. @@ -156,6 +158,8 @@ typedef _int64 GenomeDistance; extern GenomeLocation InvalidGenomeLocation; +class AltContigMap; + class Genome { public: // @@ -174,7 +178,8 @@ class Genome { unsigned maxContigs = 32); void startContig( - const char *contigName); + const char *contigName, + AltContigMap *altMap); void addData( const char *data); @@ -246,14 +251,14 @@ class Genome { struct Contig { Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL), - isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0) {} + isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation) {} GenomeLocation beginningLocation; GenomeDistance length; + bool isAlternate; - int altGroup; // each group of overlapping alt regions is given a unique ID bool isReverseStrand; // if reversed alternate strand GenomeLocation liftedLocation; // location of beginning of alt contig mapping to primary - GenomeLocation contextBefore, contextAfter; // context sequence added from primary (alts near ends have less context) + unsigned nameLength; char *name; }; @@ -278,8 +283,11 @@ class Genome { // These are only public so creators of new genomes (i.e., FASTA) can use them. // void fillInContigLengths(); + void adjustAltContigs(AltContigMap* altMap); void sortContigsByName(); + static char* findTerminator(char* buffer, const char* terminators, bool whitespaceTerminator); + private: static const int N_PADDING = 100; // Padding to add on either end of the genome to allow substring reads past it @@ -317,3 +325,38 @@ inline bool genomeLocationIsWithin(GenomeLocation locationA, GenomeLocation loca { return DistanceBetweenGenomeLocations(locationA, locationB) <= distance; } + +class AltContigMap +{ +public: + AltContigMap() {} + + static AltContigMap* readFromFile(const char* filename, const char* columnList); + + void addFastaContig(const char* lineBuffer, const char* terminator); + + void setAltContig(Genome::Contig* contig); + + const char* getParentContigName(const char* altName); + +private: + + struct AltContig { + const char* name; + const char* accession; + const char* parentAccession; + bool isRC; + GenomeLocation start, stop; + GenomeLocation parentStart, parentStop; + GenomeLocation startTail, stopTail; + AltContig() : name(NULL), accession(NULL), parentAccession(NULL), isRC(false), + start(0), stop(0), parentStart(0), parentStop(0), startTail(0), stopTail(0) {} + }; + + + const char* accessionFastaTag; + typedef std::map StringAltContigMap; + StringAltContigMap altsByAccession; + typedef std::map StringMap; + StringMap nameToAccession; +}; \ No newline at end of file diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 72f05bd6..8ca51603 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -46,11 +46,12 @@ static const double DEFAULT_SLACK = 0.3; static const unsigned DEFAULT_PADDING = 500; static const unsigned DEFAULT_KEY_BYTES = 4; static const unsigned DEFAULT_LOCATION_SIZE = 4; - +static const char* DEFAULT_ALT_COLUMNS = "gb,alt_scaf_acc,parent_acc,ori,alt_scaf_start,alt_scaf_stop,parent_start,parent_stop,alt_start_tail,alt_stop_tail"; const char *GenomeIndexFileName = "GenomeIndex"; const char *OverflowTableFileName = "OverflowTable"; const char *GenomeIndexHashFileName = "GenomeIndexHash"; const char *GenomeFileName = "Genome"; +const char *LiftedIndexDirName = "Lifted"; static void usage() { @@ -85,12 +86,16 @@ static void usage() " In particular, this will generally use less memory than the index will use once it's built, so if this doesn't work you\n" " won't be able to use the index anyway. However, if you've got sufficient memory to begin with, this option will just\n" " slow down the index build by doing extra, useless IO.\n" + "-altmap file Tab-separated file of alt contig mapping information\n" + "-altcols columns Comma-separated list of columns describing alt mapping file\n" + " Default is v38 %s\n" , DEFAULT_SEED_SIZE, DEFAULT_SLACK, DEFAULT_PADDING, DEFAULT_KEY_BYTES, - DEFAULT_LOCATION_SIZE); + DEFAULT_LOCATION_SIZE, + DEFAULT_ALT_COLUMNS); soft_exit_no_print(1); // Don't use soft-exit, it's confusing people to get an error message after the usage } @@ -121,6 +126,8 @@ GenomeIndex::runIndexer( bool large = false; unsigned locationSize = DEFAULT_LOCATION_SIZE; bool smallMemory = false; + const char* altMapFilename = NULL; + const char* altMapColumns = DEFAULT_ALT_COLUMNS; for (int n = 2; n < argc; n++) { if (strcmp(argv[n], "-s") == 0) { @@ -172,8 +179,7 @@ GenomeIndex::runIndexer( } } else if (argv[n][0] == '-' && argv[n][1] == 's' && argv[n][2] == 'm') { smallMemory = true; - } - else if (strcmp(argv[n], "-keysize") == 0) { + } else if (strcmp(argv[n], "-keysize") == 0) { if (n + 1 < argc) { keySizeInBytes = atoi(argv[n+1]); if (keySizeInBytes < 4 || keySizeInBytes > 8) { @@ -188,6 +194,20 @@ GenomeIndex::runIndexer( pieceNameTerminatorCharacters = argv[n] + 2; } else if (!strcmp(argv[n], "-bSpace")) { spaceIsAPieceNameTerminator = true; + } else if (!strcmp(argv[n], "-altmap")) { + if (n + 1 < argc) { + altMapFilename = argv[n + 1]; + n++; + } else { + usage(); + } + } else if (!strcmp(argv[n], "-altcols")) { + if (n + 1 < argc) { + altMapColumns = argv[n + 1]; + n++; + } else { + usage(); + } } else { WriteErrorMessage("Invalid argument: %s\n\n", argv[n]); usage(); @@ -223,7 +243,10 @@ GenomeIndex::runIndexer( BigAllocUseHugePages = false; _int64 start = timeInMillis(); - const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding); + + AltContigMap* altMap = altMapFilename != NULL ? AltContigMap::readFromFile(altMapFilename, altMapColumns) : NULL; + + const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, altMap); if (NULL == genome) { WriteErrorMessage("Unable to read FASTA file\n"); soft_exit(1); @@ -261,12 +284,17 @@ SetInvalidGenomeLocation(unsigned locationSize) bool GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double slack, bool computeBias, const char *directoryName, unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, unsigned hashTableKeySize, - bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory) + bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory, GenomeIndex* unliftedIndex) { PreventMachineHibernationWhileThisThreadIsAlive(); SetInvalidGenomeLocation(locationSize); + if (genome->hasAltContigs() && smallMemory) { + WriteErrorMessage("Warning: Cannot use small memory to build index with alt contigs, ignoring flag\n"); + smallMemory = false; + } + bool buildHistogram = (histogramFileName != NULL); FILE *histogramFile; if (buildHistogram) { @@ -282,7 +310,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla return false; } - int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), strlen(GenomeFileName)))) + 1); + int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), __max(strlen(GenomeFileName), strlen(LiftedIndexDirName))))) + 1); char *filenameBuffer = new char[filenameBufferSize]; fprintf(stderr,"Saving genome..."); @@ -421,6 +449,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla threadContexts[i].backpointerSpillLock = &backpointerSpillLock; threadContexts[i].lastBackpointerIndexUsedByThread = lastBackpointerIndexUsedByThread; threadContexts[i].backpointerSpillFile = backpointerSpillFile; + threadContexts[i].unliftedIndex = unliftedIndex; StartNewThread(BuildHashTablesWorkerThreadMain, &threadContexts[i]); } @@ -742,15 +771,27 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla fclose(indexFile); + if (genome->hasAltContigs() && unliftedIndex != NULL) { + // create a sub-index with only seeds that occur in alt contigs + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); + bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, + hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); + if (!ok) { + WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer); + soft_exit(1); + return false; + } + } + delete index; if (computeBias && biasTable != NULL) { delete[] biasTable; } - + WriteStatusMessage("%llds\n", (timeInMillis() + 500 - start) / 1000); delete[] filenameBuffer; - + return true; } @@ -845,7 +886,7 @@ SNAPHashTable** GenomeIndex::allocateHashTables( -GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL) +GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL), hasAlts(FALSE), liftedIndex(NULL) { } @@ -885,6 +926,9 @@ GenomeIndex::~GenomeIndex() delete genome; genome = NULL; + if (NULL != liftedIndex) { + delete liftedIndex; + } } void @@ -1171,6 +1215,7 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) const Genome *genome = context->genome; unsigned seedLen = context->seedLen; bool large = context->large; + bool lift = context->unliftedIndex != NULL; // // Batch the insertions into the hash tables, because otherwise we spend all of @@ -1202,7 +1247,12 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) Seed seed(bases, seedLen); - indexSeed(genomeLocation, seed, batches, context, &stats, large); + if (!lift) { + indexSeed(genomeLocation, seed, batches, context, &stats, large); + } + else { + indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large); + } } // For each genome base in our area // @@ -1224,9 +1274,8 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) } } - -const _int64 GenomeIndex::printPeriod = 100000000; +const _int64 GenomeIndex::printPeriod = 100000000; void @@ -1259,6 +1308,48 @@ GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBat } // If we filled a batch } + void +GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) +{ + // todo: optimize + // if this is first occurrence of seed in unlifted index, checks if seed is in any alts + // and if so, adds all locations to this index, lifting alts to non-alt locations + + _int64 nHits, nRCHits; + if (doesGenomeIndexHave64BitLocations()) { + const GenomeLocation *hits, *rcHits; + GenomeLocation singleHit[2], singleRCHit[2]; + context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); +#define CHECK_ALTS_AND_ADD_LIFTED \ + if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) { \ + bool anyAlts = false; \ + for (int i = 0; i < nHits && ! anyAlts; i++) { \ + anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; \ + } \ + for (int i = 0; i < nRCHits && !anyAlts; i++) { \ + anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; \ + } \ + if (anyAlts) { \ + for (int i = 0; i < nHits && !anyAlts; i++) { \ + indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large); \ + } \ + if (!seed.isOwnReverseComplement()) { \ + Seed rcSeed = ~seed; \ + for (int i = 0; i < nRCHits && !anyAlts; i++) { \ + indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large); \ + } \ + } \ + } \ + } + CHECK_ALTS_AND_ADD_LIFTED + } + else { + const unsigned *hits, *rcHits; + context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); + CHECK_ALTS_AND_ADD_LIFTED + } +} + void GenomeIndex::ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement, _int64 *bothComplementsUsed, _int64 *genomeLocationsInOverflowTable, _int64 *seedsWithMultipleOccurrences, bool large) @@ -1929,6 +2020,16 @@ GenomeIndex::lookupSeedAlt32( lookupSeed32(seed, nHits, hits, nRCHits, rcHits); *unliftedHits = *hits; *unliftedRCHits = *rcHits; + if (hasAlts) { + _int64 nLiftedHits, nLiftedRCHits; + const unsigned *liftedHits, *liftedRCHits; + liftedIndex->lookupSeed32(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits); + if (nLiftedHits != 0 || nLiftedRCHits != 0) { + _ASSERT(nLiftedHits == *nHits && nLiftedRCHits == *nRCHits); + *hits = liftedHits; + *rcHits = liftedRCHits; + } + } } void @@ -2073,6 +2174,16 @@ GenomeIndex::lookupSeedAlt( lookupSeed(seed, nHits, hits, nRCHits, rcHits, singleHit, singleRCHit); *unliftedHits = *hits; *unliftedRCHits = *rcHits; + if (hasAlts) { + _int64 nLiftedHits, nLiftedRCHits; + const GenomeLocation *liftedHits, *liftedRCHits; + liftedIndex->lookupSeed(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits, singleHit + 1, singleRCHit + 1); + if (nLiftedHits != 0 || nLiftedRCHits != 0) { + _ASSERT(nLiftedHits == *nHits && nLiftedRCHits == *nRCHits); + *hits = liftedHits; + *rcHits = liftedRCHits; + } + } } void diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index 2cdd0782..49eea33b 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -93,6 +93,12 @@ class GenomeIndex { unsigned nHashTables; const Genome *genome; + // TRUE if genome has alt contigs + bool hasAlts; + + // secondary index for all seeds that map to alt contigs with locations lifted to non-alt contigs + GenomeIndex* liftedIndex; + bool largeHashTable; unsigned locationSize; @@ -154,12 +160,13 @@ class GenomeIndex { // Build a genome index and write it to a directory. If you don't already have a saved index // the only way to get one is to build it into a directory and then load it from the directory. // NB: This deletes the Genome that's passed into it. + // unliftedIndex is an internal parameter used to build 2-level index for genomes with alt contigs // static bool BuildIndexToDirectory(const Genome *genome, int seedLen, double slack, - bool computeBias, const char *directory, + bool computeBias, const char *directoryName, unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, unsigned hashTableKeySize, bool large, const char *histogramFileName, - unsigned locationSize, bool smallMemory); + unsigned locationSize, bool smallMemory, GenomeIndex *unliftedIndex = NULL); // @@ -233,6 +240,9 @@ class GenomeIndex { ExclusiveLock *backpointerSpillLock; FILE *backpointerSpillFile; + // used for building sub-index of only seeds that occur in alt contigs + GenomeIndex *unliftedIndex; + ExclusiveLock *hashTableLocks; ExclusiveLock *overflowTableLock; }; @@ -281,6 +291,7 @@ class GenomeIndex { static const _int64 printPeriod; virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); + virtual void indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); static void BuildHashTablesWorkerThreadMain(void *param); diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index a0d1c759..c0c30b3e 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -342,17 +342,14 @@ IntersectingPairedEndAligner::align( if (doesGenomeIndexHave64BitLocations) { index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); - } - else { + } else { index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]); } - } - else { + } else { if (doesGenomeIndexHave64BitLocations) { index->lookupSeedAlt(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], &unliftedHits[FORWARD], &unliftedHits[RC], hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation()); - } - else { + } else { index->lookupSeedAlt32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC], &unliftedHits32[FORWARD], &unliftedHits32[RC]); } } @@ -370,16 +367,13 @@ IntersectingPairedEndAligner::align( if (!doesGenomeIndexHaveAlts) { if (doesGenomeIndexHave64BitLocations) { hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], NULL, beginsDisjointHitSet[dir]); - } - else { + } else { hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], NULL, beginsDisjointHitSet[dir]); } - } - else { + } else { if (doesGenomeIndexHave64BitLocations) { hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], unliftedHits[dir], beginsDisjointHitSet[dir]); - } - else { + } else { hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], unliftedHits32[dir], beginsDisjointHitSet[dir]); } } @@ -687,11 +681,11 @@ IntersectingPairedEndAligner::align( // reduce probability of pairs matching across different overlapping alts // todo: assuming if they're on different alts within maxSpacing they overlap - true for GRCh38 but not necessarily for all genomes - // use crossover probability with 1 centiMorgan ~= 1Mbp - if (doesGenomeIndexHaveAlts && isBothAltPairMapping(candidate, mate) && - abs(mate->readWithMoreHitsUnliftedGenomeLocation - candidate->readWithFewerHitsUnliftedGenomeLocation) > 2*maxSpacing) + // use crossover probability with 1 centiMorgan ~= 1Mbp - too strict? + if (doesGenomeIndexHaveAlts && candidate->isAlt() && mate->isAlt() && + DistanceBetweenGenomeLocations(mate->readWithMoreHitsUnliftedGenomeLocation, candidate->readWithFewerHitsUnliftedGenomeLocation) > 2*maxSpacing) { - pairProbability *= 1e-8 * abs(candidate->readWithFewerHitsGenomeLocation - mate->readWithMoreHitsGenomeLocation); + pairProbability *= 1e-8 * DistanceBetweenGenomeLocations(candidate->readWithFewerHitsGenomeLocation, mate->readWithMoreHitsGenomeLocation); } // @@ -753,7 +747,7 @@ IntersectingPairedEndAligner::align( candidate->mergeAnchor = mergeAnchor; } else { merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, - pairProbability, pairScore, doesGenomeIndexHaveAlts && isNonAltPairMapping(candidate, mate), &oldPairProbability); + pairProbability, pairScore, doesGenomeIndexHaveAlts && (! candidate->isAlt()) && (!mate->isAlt()), &oldPairProbability); } if (!merged) { @@ -768,7 +762,7 @@ IntersectingPairedEndAligner::align( if (pairScore <= maxK && (pairScore < bestPairScore || (pairScore == bestPairScore && (pairProbability > probabilityOfBestPair || - (pairProbability == probabilityOfBestPair && isNonAltPairMapping(candidate, mate)))))) { + (pairProbability == probabilityOfBestPair && (! candidate->isAlt()) && (!mate->isAlt())))))) { // // A new best hit. // diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h index e951f64b..403e28e6 100644 --- a/SNAPLib/IntersectingPairedEndAligner.h +++ b/SNAPLib/IntersectingPairedEndAligner.h @@ -414,6 +414,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner matchProbability = 0; genomeOffset = 0; } + bool isAlt() const { return readWithMoreHitsGenomeLocation != readWithMoreHitsUnliftedGenomeLocation; } }; struct ScoringCandidate { @@ -440,20 +441,9 @@ class IntersectingPairedEndAligner : public PairedEndAligner scoreListNext = scoreListNext_; mergeAnchor = NULL; } + bool isAlt() const { return readWithFewerHitsGenomeLocation != readWithFewerHitsUnliftedGenomeLocation; } }; - static bool isNonAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate) - { - return candidate->readWithFewerHitsGenomeLocation == candidate->readWithFewerHitsUnliftedGenomeLocation && - mate->readWithMoreHitsGenomeLocation == mate->readWithMoreHitsUnliftedGenomeLocation; - } - - static bool isBothAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate) - { - return candidate->readWithFewerHitsGenomeLocation != candidate->readWithFewerHitsUnliftedGenomeLocation && - mate->readWithMoreHitsGenomeLocation != mate->readWithMoreHitsUnliftedGenomeLocation; - } - // // A pool of scoring candidates. For each alignment call, we free them all by resetting lowestFreeScoringCandidatePoolEntry to 0, // and then fill in the content when they're initialized. This means that for alignments with few candidates we'll be using the same From 3046b3081c7bdb1fc9160ef3481b2e2434eb1356 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Mon, 7 Dec 2015 11:07:29 -0800 Subject: [PATCH 03/19] Genome index read/write --- SNAPLib/FASTA.cpp | 31 ++++++++++- SNAPLib/Genome.cpp | 118 +++++++++++++++++++++------------------- SNAPLib/Genome.h | 12 ++-- SNAPLib/GenomeIndex.cpp | 67 ++++++++++++++++------- SNAPLib/GenomeIndex.h | 8 ++- 5 files changed, 149 insertions(+), 87 deletions(-) diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp index 75d92a08..659bc990 100644 --- a/SNAPLib/FASTA.cpp +++ b/SNAPLib/FASTA.cpp @@ -97,11 +97,38 @@ ReadFASTAGenome( // // Now supply the chromosome name. // - char * terminator = Genome::findTerminator(lineBuffer, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator); + char * terminator = lineBuffer + strlen(lineBuffer); + char * p; + if (NULL != pieceNameTerminatorCharacters) { + for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { + p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]); + if (NULL != p && p < terminator) { + terminator = p; + } + } + } + if (spaceIsAPieceNameTerminator) { + p = strchr(lineBuffer, ' '); + if (NULL != p && p < terminator) { + terminator = p; + } + p = strchr(lineBuffer, '\t'); + if (NULL != p && p < terminator) { + terminator = p; + } + } + p = strchr(lineBuffer, '\n'); + if (NULL != p && p < terminator) { + terminator = p; + } + p = strchr(lineBuffer, '\r'); + if (NULL != p && p < terminator) { + terminator = p; + } if (altMap != NULL) { altMap->addFastaContig(lineBuffer, terminator); } - *terminator = 0; + *terminator = '\0'; genome->startContig(lineBuffer+1, altMap); } else { if (!inAContig) { diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 07460392..2da7421a 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -35,7 +35,7 @@ Revision History: Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs) : maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs), - mappedFile(NULL) +mappedFile(NULL), minAltLocation(i_maxBases) { bases = ((char *) BigAlloc(nBasesStored + 2 * N_PADDING)) + N_PADDING; if (NULL == bases) { @@ -154,7 +154,13 @@ Genome::saveToFile(const char *fileName) const curChar = contigs[i].name + n; if (*curChar == ' '){ *curChar = '_'; } } - fprintf(saveFile,"%lld %s\n",contigs[i].beginningLocation, contigs[i].name); + if (!hasAltContigs()) { + // backward compatible for genomes without alts + fprintf(saveFile, "%lld %s\n", contigs[i].beginningLocation, contigs[i].name); + } else { + fprintf(saveFile, "%lld %s %d %d %lld\n", contigs[i].beginningLocation, contigs[i].name, + contigs[i].isAlternate ? 1 : 0, contigs[i].isAlternateRC ? 1 : 0, contigs[i].liftedLocation); + } } // @@ -223,9 +229,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc int contigNameBufferSize = 0; char *contigNameBuffer = NULL; - unsigned n; - size_t contigSize; - char *curName; + genome->minAltLocation = nBases; for (unsigned i = 0; i < nContigs; i++) { if (NULL == reallocatingFgetsGenericFile(&contigNameBuffer, &contigNameBufferSize, loadFile)) { WriteErrorMessage("Unable to read contig description\n"); @@ -234,29 +238,48 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc return NULL; } - for (n = 0; n < (unsigned)contigNameBufferSize; n++) { - if (contigNameBuffer[n] == ' ') { - contigNameBuffer[n] = '\0'; - break; - } - } - + contigNameBuffer[contigNameBufferSize - 1] = '\0'; _int64 contigStart; - if (1 != sscanf(contigNameBuffer, "%lld", &contigStart)) { - WriteErrorMessage("Unable to parse contig start in genome file '%s', '%s%'\n", fileName, contigNameBuffer); + const char* SEP = " \n\r"; + char *token = strtok(contigNameBuffer, SEP); + if (token == NULL || 1 != sscanf(token, "%lld", &contigStart)) { +err_contig_parse: + WriteErrorMessage("Unable to parse contigs in genome file '%s', '%s%'\n", fileName, contigNameBuffer); soft_exit(1); } genome->contigs[i].beginningLocation = GenomeLocation(contigStart); - contigNameBuffer[n] = ' '; - n++; // increment n so we start copying at the position after the space - contigSize = strlen(contigNameBuffer + n) - 1; //don't include the final \n - genome->contigs[i].name = new char[contigSize + 1]; - genome->contigs[i].nameLength = (unsigned)contigSize; - curName = genome->contigs[i].name; - for (unsigned pos = 0; pos < contigSize; pos++) { - curName[pos] = contigNameBuffer[pos + n]; - } - curName[contigSize] = '\0'; + token = strtok(NULL, SEP); + if (token == NULL) goto err_contig_parse; + genome->contigs[i].name = new char[strlen(token) + 1]; + genome->contigs[i].nameLength = (unsigned)strlen(token); + strcpy(genome->contigs[i].name, token); + token = strtok(NULL, SEP); + if (token == NULL) { + genome->contigs[i].isAlternate = false; + genome->contigs[i].isAlternateRC = false; + genome->contigs[i].liftedLocation = InvalidGenomeLocation; + } else { + int isAlternate; + if (1 != sscanf(token, "%d", &isAlternate)) { + goto err_contig_parse; + } + genome->contigs[i].isAlternate = isAlternate != 0; + int isAlternateRC; + if (token == NULL || 1 != sscanf(token, "%d", &isAlternateRC)) { + goto err_contig_parse; + } + genome->contigs[i].isAlternateRC = isAlternateRC != 0; + _int64 liftedLocation; + if (token == NULL || 1 != sscanf(token, "%lld", &liftedLocation)) { + goto err_contig_parse; + } + genome->contigs[i].liftedLocation = liftedLocation; + + if (isAlternate && contigStart < genome->minAltLocation.location) { + genome->minAltLocation = contigStart; + } + } + } // for each contig if (0 != loadFile->advance(GenomeLocationAsInt64(minLocation))) { @@ -476,9 +499,13 @@ void Genome::adjustAltContigs(AltContigMap* altMap) return; } bool error = false; - // build parent links from alt contigs + // build parent links from alt contigs, and find minAltLocation + minAltLocation = maxBases; for (int i = 0; i < nContigs; i++) { if (contigs[i].isAlternate) { + if (contigs[i].beginningLocation < minAltLocation) { + minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2; + } const char* parentName = altMap->getParentContigName(contigs[i].name); if (parentName == NULL) { WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name); @@ -505,43 +532,24 @@ void Genome::adjustAltContigs(AltContigMap* altMap) // flip RC contigs for (int i = 0; i < nContigs; i++) { - if (contigs[i].isAlternate && contigs[i].isReverseStrand) { + if (contigs[i].isAlternate && contigs[i].isAlternateRC) { util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length); } } } -char * Genome::findTerminator(char* lineBuffer, const char* pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator) +GenomeLocation Genome::getLiftedLocation(GenomeLocation altLocation) const { - char* result = lineBuffer + strlen(lineBuffer); - if (NULL != pieceNameTerminatorCharacters) { - for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { - char *terminator = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]); - if (NULL != terminator && terminator < result) { - result = terminator; - } - } - } - if (spaceIsAPieceNameTerminator) { - char *terminator = strchr(lineBuffer, ' '); - if (NULL != terminator && terminator < result) { - result = terminator; - } - terminator = strchr(lineBuffer, '\t'); - if (NULL != terminator && terminator < result) { - result = terminator; - } - } - char *terminator = strchr(lineBuffer, '\n'); - if (NULL != terminator) { - result = terminator; + if (minAltLocation < minAltLocation) { + return altLocation; } - terminator = strchr(lineBuffer, '\r'); - if (NULL != terminator) { - result = terminator; + const Contig* alt = getContigAtLocation(altLocation); + if (alt == NULL) { + return altLocation; } - return result; + return alt->liftedLocation + (altLocation - alt->beginningLocation); // todo: padding?? } + const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const { const Contig *contig = getContigAtLocation(location); @@ -776,12 +784,12 @@ void AltContigMap::setAltContig(Genome::Contig* contig) StringAltContigMap::iterator alt = altsByAccession.find(accession->second); if (alt != altsByAccession.end()) { contig->isAlternate = true; - contig->isReverseStrand = alt->second.isRC; + contig->isAlternateRC = alt->second.isRC; return; } } contig->isAlternate = false; - contig->isReverseStrand = false; + contig->isAlternateRC = false; } const char* AltContigMap::getParentContigName(const char* altName) diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index f2ee8a67..4b6d926c 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -251,12 +251,12 @@ class Genome { struct Contig { Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL), - isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation) {} + isAlternate(false), isAlternateRC(false), liftedLocation(InvalidGenomeLocation) {} GenomeLocation beginningLocation; GenomeDistance length; bool isAlternate; - bool isReverseStrand; // if reversed alternate strand + bool isAlternateRC; // if reversed alternate strand GenomeLocation liftedLocation; // location of beginning of alt contig mapping to primary unsigned nameLength; @@ -272,9 +272,9 @@ class Genome { const Contig *getNextContigAfterLocation(GenomeLocation location) const; int getContigNumAtLocation(GenomeLocation location) const; // Returns the contig number, which runs from 0 .. getNumContigs() - 1. - inline bool hasAltContigs() const { return FALSE; } // todo: implement + inline bool hasAltContigs() const { return minAltLocation < maxBases; } - GenomeLocation getLiftedLocation(GenomeLocation altLocation) const { return altLocation; } // todo: implement + GenomeLocation getLiftedLocation(GenomeLocation altLocation) const; // unused Genome *copy() const {return copy(true,true,true);} // unused Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);} @@ -286,8 +286,6 @@ class Genome { void adjustAltContigs(AltContigMap* altMap); void sortContigsByName(); - static char* findTerminator(char* buffer, const char* terminators, bool whitespaceTerminator); - private: static const int N_PADDING = 100; // Padding to add on either end of the genome to allow substring reads past it @@ -301,6 +299,8 @@ class Genome { GenomeLocation minLocation; GenomeLocation maxLocation; + GenomeLocation minAltLocation; + // // A genome is made up of a bunch of contigs, typically chromosomes. Contigs have names, // which are stored here. diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 8ca51603..717cb58c 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -487,8 +487,12 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // We're done with the raw genome. Delete it to save some memory. // - delete genome; - genome = NULL; + bool genomeHasAlts = genome->hasAltContigs(); + if (! (genomeHasAlts && unliftedIndex == NULL)) { + // delete if we won't need it later + delete genome; + genome = NULL; + } char *halfBuiltHashTableSpillFileName = NULL; @@ -766,16 +770,21 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla return false; } - fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", GenomeIndexFormatMajorVersion, GenomeIndexFormatMinorVersion, index->nHashTables, + fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", + // NOTE: this must be changed if the format no longer supports v5 (pre-alt) + genomeHasAlts ? GenomeIndexFormatMajorVersion : GenomeIndexFormatMajorVersionWithoutAlts, + GenomeIndexFormatMinorVersion, index->nHashTables, index->overflowTableSize, seedLen, chromosomePaddingSize, hashTableKeySize, totalBytesWritten, large ? 0 : 1, locationSize); fclose(indexFile); - if (genome->hasAltContigs() && unliftedIndex != NULL) { + if (genomeHasAlts && unliftedIndex == NULL) { // create a sub-index with only seeds that occur in alt contigs snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); + delete genome; + genome = NULL; if (!ok) { WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer); soft_exit(1); @@ -1697,7 +1706,7 @@ GenomeIndex::printBiasTables() } GenomeIndex * -GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch) +GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, bool liftedIndex) { int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), strlen(GenomeFileName)))) + 1); char *filenameBuffer = new char[filenameBufferSize]; @@ -1740,7 +1749,7 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch) indexFile->close(); delete indexFile; - if (majorVersion != GenomeIndexFormatMajorVersion) { + if (majorVersion != GenomeIndexFormatMajorVersion && majorVersion != GenomeIndexFormatMajorVersionWithoutAlts) { WriteErrorMessage("This genome index appears to be from a different version of SNAP than this, and so we can't read it. Index version %d, SNAP index format version %d\n", majorVersion, GenomeIndexFormatMajorVersion); soft_exit(1); @@ -1920,22 +1929,38 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch) blobFile = NULL; } - snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); - if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) { - WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n"); - delete[] filenameBuffer; - delete index; - return NULL; - } + if (!liftedIndex) { + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); + if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) { + WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n"); + delete[] filenameBuffer; + delete index; + return NULL; + } - if ((_int64)index->genome->getCountOfBases() + (_int64)index->overflowTableSize > 0xfffffff0 && locationSize == 4) { - WriteErrorMessage("\nThis index has too many overflow entries to be valid. Some early versions of SNAP\n" - "allowed building indices with too small of a seed size, and this appears to be such\n" - "an index. You can no longer build indices like this, and you also can't use them\n" - "because they are corrupt and would produce incorrect results. Please use an index\n" - "built with a larger seed size. For hg19, the seed size must be at least 19.\n" - "For other reference genomes this quantity will vary.\n"); - soft_exit(1); + if ((_int64)index->genome->getCountOfBases() + (_int64)index->overflowTableSize > 0xfffffff0 && locationSize == 4) { + WriteErrorMessage("\nThis index has too many overflow entries to be valid. Some early versions of SNAP\n" + "allowed building indices with too small of a seed size, and this appears to be such\n" + "an index. You can no longer build indices like this, and you also can't use them\n" + "because they are corrupt and would produce incorrect results. Please use an index\n" + "built with a larger seed size. For hg19, the seed size must be at least 19.\n" + "For other reference genomes this quantity will vary.\n"); + soft_exit(1); + } + + if (index->genome->hasAltContigs()) { + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); + index->liftedIndex = loadFromDirectory(filenameBuffer, map, prefetch, true); + if (index->liftedIndex == NULL) { + WriteErrorMessage("Missing alt index directory %s\n", filenameBuffer); + soft_exit(1); + } + index->liftedIndex->genome = index->genome; + } else { + index->liftedIndex = NULL; + } + } else { + index->genome = NULL; } delete[] filenameBuffer; diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index 49eea33b..ca45201e 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -82,7 +82,7 @@ class GenomeIndex { // static void runIndexer(int argc, const char **argv); - static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch); + static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch, bool liftedIndex = false); static void printBiasTables(); @@ -175,9 +175,11 @@ class GenomeIndex { static SNAPHashTable** allocateHashTables(unsigned* o_nTables, GenomeDistance countOfBases, double slack, int seedLen, unsigned hashTableKeySize, bool large, unsigned locationSize, double* biasTable = NULL); - static const unsigned GenomeIndexFormatMajorVersion = 5; + static const unsigned GenomeIndexFormatMajorVersion = 6; static const unsigned GenomeIndexFormatMinorVersion = 0; - + // NOTE: this must be changed if the format no longer supports v5 (pre-alt) + static const unsigned GenomeIndexFormatMajorVersionWithoutAlts = 5; + static const unsigned largestBiasTable = 32; // Can't be bigger than the biggest seed size, which is set in Seed.h. Bigger than 32 means a new Seed structure. static const unsigned largestKeySize = 8; static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1]; From a4b492deb8d3ef0e9e9a49842313001452e37f3c Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Mon, 7 Dec 2015 20:55:49 -0800 Subject: [PATCH 04/19] Alt test data generator --- tests/alttestgen.py | 125 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 125 insertions(+) create mode 100644 tests/alttestgen.py diff --git a/tests/alttestgen.py b/tests/alttestgen.py new file mode 100644 index 00000000..16769edc --- /dev/null +++ b/tests/alttestgen.py @@ -0,0 +1,125 @@ +# alttestgen.py +# +# generate data files for alt-contig test +# +# test is alttest.py +# + +import sys +import os +import shutil +import subprocess +import random + +import pandas as pd + +BASES = "ACTG" +RCBASES = {"A":"T", "T":"A", "C":"G", "G":"C"} + +def random_bases(n): + result = "" + for i in range(n): + result = result + random.choice(BASES) + return result + +def random_mutate(seq, p = 0.02): + for i in range(len(seq)): + if random.random() <= p: + b = "ACTG".find(seq[i:i+1]) + seq = seq[:i] + random.choice(BASES[:b] + BASES[b+1:]) + seq[i + 1:] + return seq + +def rc(seq): + result = "" + for c in seq: + result = RCBASES[c] + result + return result + +class Read: + def __init__(self, id, chr, pos, seq, qual=None): + self.id = id + self.chr = chr + self.pos = pos + self.seq = seq + self.qual = qual + + def __str__(self): + return "Read({}, {}, {}, {})".format(self.id, self.chr, self.pos, self.seq) + + def to_sam_pair(self, other): + r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( + self.id, 99, self.chr, self.pos, 60, len(self.seq), other.chr, + other.pos, abs(self.pos - other.pos + len(other.seq)), self.seq, 'A'*len(self.seq)) + return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( + other.id, 147, other.chr, other.pos, 60, len(other.seq), self.chr, + self.pos, abs(self.pos - other.pos + len(other.seq)), other.seq, 'A'*len(other.seq)) + +class Contig: + def __init__(self, name, accession, seq, isAlt=False, parent=None, parentLoc = 0, isAltRC=False): + self.name = name + self.accession = accession + self.seq = seq + self.isAlt = isAlt + self.parent = parent + self.parentLoc = parentLoc + self.isAltRC = isAltRC + + def __str__(self): + return "Contig({}, {}, {}, {}, {}, {}, {})".format( + self.name, self.accession, self.seq, 'alt' if self.isAlt else 'ref', + self.parent, self.parentLoc, 'rc' if self.isAltRC else '') + +class Genome: + def __init__(self, contigs={}): + self.contigs = contigs + + def add(self, contig): + self.contigs[contig.name] = contig + + def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.1): + pc = self.contigs[parent] + altseq = random_mutate(pc.seq[start:stop], pmut) + if (isRC): + altseq = rc(altseq) + self.add(Contig(name, accession, altseq, True, parent, start, isRC)) + + def get_seq(self, chr, start, end): + return self.contigs[chr].seq[start:end] + + def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None): + if id == None: + id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos, ('r' if isRC else 'f')) + return Read(id, chr, pos, random_mutate(self.get_seq(chr, pos, pos + len), pmut)) + + def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02): + id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1, chr2, pos2) + r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1") + r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2") + return [r1, r2] + + def write_fasta(self, filename): + with open(filename, 'w') as file: + for contig in self.contigs.values(): + file.write(">{}|gb|{}\n".format(contig.name, contig.accession)) + for i in range(0, len(contig.seq), 80): + file.write("{}\n".format(contig.seq[i:i+80])) + + def write_alts(self, filename): + with open(filename, 'w') as file: + file.write("#alt_scaf_acc\tparent_acc\tori\talt_scaf_start\talt_scaf_stop\tparent_start\tparent_stop\talt_start_tail\talt_stop_tail\n") + for contig in self.contigs.values(): + if contig.isAlt: + file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format( + contig.accession, self.contigs[contig.parent].accession, '-' if contig.isAltRC else '+', + 1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0)) + +g = Genome() +g.add(Contig("chr1", "C01", random_bases(2000))) +g.add_alt("chr1a", "C01A", "chr1", 500, 1500) +g.write_fasta("test.fa") +g.write_alts("test_alts.txt") + +with open("test.sam", "w") as file: + for i in range(0, 101, 10): + [r1, r2] = g.make_pair('chr1', 500 + i, 'chr1a', i) + file.write(r1.to_sam_pair(r2)) From 7e09023244279c0e6ec39427cb5ae5efcbe91339 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Tue, 8 Dec 2015 16:18:23 -0800 Subject: [PATCH 05/19] Simple test runs --- SNAPLib/BaseAligner.cpp | 42 +++------- SNAPLib/BaseAligner.h | 22 ++++++ SNAPLib/Genome.cpp | 98 ++++++++++++++---------- SNAPLib/Genome.h | 3 +- SNAPLib/GenomeIndex.cpp | 67 +++++++++++----- SNAPLib/IntersectingPairedEndAligner.cpp | 21 ++++- tests/alttestgen.py | 34 ++++---- 7 files changed, 179 insertions(+), 108 deletions(-) diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp index 02efa6c0..39c3489f 100644 --- a/SNAPLib/BaseAligner.cpp +++ b/SNAPLib/BaseAligner.cpp @@ -44,27 +44,6 @@ using std::min; #define TRACE(...) {} #endif - -typedef struct MatchInfo -{ - GenomeLocation location; - GenomeLocation liftedLocation; - double matchProbability; - - MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p) : - location(_loc), liftedLocation(_lifted), matchProbability(_p) {} -} MatchInfo; - -bool -matchInfoComparator( - const MatchInfo& a, - const MatchInfo& b) -{ - return a.liftedLocation < b.liftedLocation; -} - -typedef VariableSizeVector MatchInfoVector; - BaseAligner::BaseAligner( GenomeIndex *i_genomeIndex, unsigned i_maxHitsToConsider, @@ -85,7 +64,7 @@ BaseAligner::BaseAligner( genomeIndex(i_genomeIndex), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK), maxReadSize(i_maxReadSize), maxSeedsToUseFromCommandLine(i_maxSeedsToUseFromCommandLine), maxSeedCoverage(i_maxSeedCoverage), readId(-1), extraSearchDepth(i_extraSearchDepth), - explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats), + explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats), allMatches(NULL), noUkkonen(i_noUkkonen), noOrderedEvaluation(i_noOrderedEvaluation), noTruncation(i_noTruncation), minWeightToCheck(max(1u, i_minWeightToCheck)), maxSecondaryAlignmentsPerContig(i_maxSecondaryAlignmentsPerContig) /*++ @@ -247,6 +226,9 @@ Routine Description: } hashTableEpoch = 0; + if (genome->hasAltContigs()) { + allMatches = new MatchInfoVector(); + } } @@ -677,8 +659,7 @@ Return Value: * Add up the highest-probability matches of all overlapping alternates */ double -computeLiftedCandidateProbability( - MatchInfoVector* allMatches, +BaseAligner::computeLiftedCandidateProbability( GenomeDistance length) { std::sort(allMatches->begin(), allMatches->end(), matchInfoComparator); @@ -796,10 +777,9 @@ Return Value: #endif unsigned weightListToCheck = highestUsedWeightList; - MatchInfoVector* allMatches = NULL; - bool anyAltMatches = FALSE; - if (genome->hasAltContigs()) { - allMatches = new MatchInfoVector(); + bool anyAltMatches = false; + if (allMatches != NULL) { + allMatches->clear(); } do { @@ -822,7 +802,7 @@ Return Value: if (bestScore <= maxK) { primaryResult->location = bestScoreGenomeLocation; if (anyAltMatches) { - probabilityOfAllCandidates = computeLiftedCandidateProbability(allMatches, read[0]->getDataLength()); + probabilityOfAllCandidates = computeLiftedCandidateProbability(read[0]->getDataLength()); } primaryResult->mapq = computeMAPQ(probabilityOfAllCandidates, probabilityOfBestCandidate, bestScore, popularSeedsSkipped); if (primaryResult->mapq >= MAPQ_LIMIT_FOR_SINGLE_HIT) { @@ -976,8 +956,8 @@ Return Value: // remember in case there are alt matches if (allMatches != NULL) { - if ((! anyAltMatches) && genome->getContigAtLocation(genomeLocation)->isAlternate) { - anyAltMatches = TRUE; + if ((! anyAltMatches) && genome->getLiftedLocation(genomeLocation) != genomeLocation) { + anyAltMatches = true; } allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability)); } diff --git a/SNAPLib/BaseAligner.h b/SNAPLib/BaseAligner.h index 61a24cba..ee369a7c 100644 --- a/SNAPLib/BaseAligner.h +++ b/SNAPLib/BaseAligner.h @@ -34,6 +34,7 @@ Revision History: #include "AlignerStats.h" #include "directions.h" #include "GenomeIndex.h" +#include "VariableSizeVector.h" extern bool doAlignerPrefetch; @@ -326,6 +327,27 @@ class BaseAligner { AlignerStats *stats; + typedef struct MatchInfo + { + GenomeLocation location; + GenomeLocation liftedLocation; + double matchProbability; + + MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p) : + location(_loc), liftedLocation(_lifted), matchProbability(_p) {} + } MatchInfo; + + static bool matchInfoComparator(const BaseAligner::MatchInfo& a, const BaseAligner::MatchInfo& b) + { + return a.liftedLocation < b.liftedLocation; + } + + typedef VariableSizeVector MatchInfoVector; + + MatchInfoVector* allMatches; + + double computeLiftedCandidateProbability(GenomeDistance length); + unsigned *hitCountByExtraSearchDepth; // How many hits at each depth bigger than the current best edit distance. // So if the current best hit has edit distance 2, then hitCountByExtraSearchDepth[0] would // be the count of hits at edit distance 2, while hitCountByExtraSearchDepth[2] would be the count diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 2da7421a..06c350e1 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -265,18 +265,20 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc } genome->contigs[i].isAlternate = isAlternate != 0; int isAlternateRC; + token = strtok(NULL, SEP); if (token == NULL || 1 != sscanf(token, "%d", &isAlternateRC)) { goto err_contig_parse; } genome->contigs[i].isAlternateRC = isAlternateRC != 0; _int64 liftedLocation; + token = strtok(NULL, SEP); if (token == NULL || 1 != sscanf(token, "%lld", &liftedLocation)) { goto err_contig_parse; } genome->contigs[i].liftedLocation = liftedLocation; if (isAlternate && contigStart < genome->minAltLocation.location) { - genome->minAltLocation = contigStart; + genome->minAltLocation = contigStart - chromosomePadding / 2; } } @@ -309,6 +311,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc } genome->fillInContigLengths(); + genome->adjustAltContigs(NULL); genome->sortContigsByName(); delete[] contigNameBuffer; return genome; @@ -495,39 +498,38 @@ void Genome::fillInContigLengths() void Genome::adjustAltContigs(AltContigMap* altMap) { - if (altMap == NULL) { - return; - } - bool error = false; - // build parent links from alt contigs, and find minAltLocation - minAltLocation = maxBases; - for (int i = 0; i < nContigs; i++) { - if (contigs[i].isAlternate) { - if (contigs[i].beginningLocation < minAltLocation) { - minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2; - } - const char* parentName = altMap->getParentContigName(contigs[i].name); - if (parentName == NULL) { - WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name); - error = true; - continue; - } - GenomeLocation parentLocation; - int parentIndex; - if (!getLocationOfContig(parentName, &parentLocation, &parentIndex)) { - WriteErrorMessage("Unable to find parent contig %s for alt contig %s\n", parentName, contigs[i].name); - error = true; - continue; - } - if (contigs[parentIndex].isAlternate) { - WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName); - error = true; continue; + if (altMap != NULL) { + bool error = false; + // build parent links from alt contigs, and find minAltLocation + minAltLocation = maxBases; + for (int i = 0; i < nContigs; i++) { + if (contigs[i].isAlternate) { + if (contigs[i].beginningLocation < minAltLocation) { + minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2; + } + const char* parentName = altMap->getParentContigName(contigs[i].name); + if (parentName == NULL) { + WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name); + error = true; + continue; + } + GenomeLocation parentLocation; + int parentIndex; + if (!getLocationOfContig(parentName, &parentLocation, &parentIndex)) { + WriteErrorMessage("Unable to find parent contig %s for alt contig %s\n", parentName, contigs[i].name); + error = true; + continue; + } + if (contigs[parentIndex].isAlternate) { + WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName); + error = true; continue; + } + contigs[i].liftedLocation = parentLocation; } - contigs[i].liftedLocation = parentLocation; } - } - if (error) { - soft_exit(1); + if (error) { + soft_exit(1); + } } // flip RC contigs @@ -540,14 +542,14 @@ void Genome::adjustAltContigs(AltContigMap* altMap) GenomeLocation Genome::getLiftedLocation(GenomeLocation altLocation) const { - if (minAltLocation < minAltLocation) { + if (altLocation < minAltLocation) { return altLocation; } const Contig* alt = getContigAtLocation(altLocation); - if (alt == NULL) { + if (alt == NULL || ! alt->isAlternate) { return altLocation; } - return alt->liftedLocation + (altLocation - alt->beginningLocation); // todo: padding?? + return alt->liftedLocation + (altLocation - alt->beginningLocation); } const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const @@ -655,7 +657,10 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum soft_exit(1); } *q = '\0'; - result->accessionFastaTag = p; + char * tag = (char*) malloc(q - p + 2); + strcpy(tag, p); + strcat(tag, "|"); + result->accessionFastaTag = tag; // get names for each column type (last 2 are optional) p = q + 1; @@ -689,7 +694,6 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum WriteErrorMessage("Invalid file format for alt data in %s\n", filename); soft_exit(1); } - *q = '\0'; for (int i = 0; i <= N_COLUMNS; i++) { if (i < N_COLUMNS && !strcmp(columnNames[i], p)) { columnTypes.add(i); @@ -711,6 +715,9 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum AltContig alt; for (int columnIndex = 0; !endOfLine; columnIndex++) { q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile); + if (endOfFile) { + break; + } switch (columnTypes[columnIndex]) { case ALT_SCAF_ACC: alt.accession = p; @@ -747,7 +754,9 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum } p = q; } - result->altsByAccession[alt.accession] = alt; + if (!endOfFile) { + result->altsByAccession[alt.accession] = alt; + } } return result; } @@ -771,10 +780,16 @@ void AltContigMap::addFastaContig(const char* lineBuffer, const char* nameTermin q++; } char* accession = (char*)malloc(q - p); - memcpy(accession, p, q - p - 1); + memcpy(accession, p, q - p); *(accession + (q - p)) = '\0'; nameToAccession[name] = accession; + accessionToName[accession] = name; + + StringAltContigMap::iterator alt = altsByAccession.find(accession); + if (alt != altsByAccession.end()) { + alt->second.name = name; + } } void AltContigMap::setAltContig(Genome::Contig* contig) @@ -798,7 +813,10 @@ const char* AltContigMap::getParentContigName(const char* altName) if (accession != nameToAccession.end()) { StringAltContigMap::iterator alt = altsByAccession.find(accession->second); if (alt != altsByAccession.end()) { - return alt->second.name; + StringMap::iterator parent = accessionToName.find(alt->second.parentAccession); + if (parent != accessionToName.end()) { + return parent->second.data(); + } } } return NULL; diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index 4b6d926c..38c8d087 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -359,4 +359,5 @@ class AltContigMap StringAltContigMap altsByAccession; typedef std::map StringMap; StringMap nameToAccession; -}; \ No newline at end of file + StringMap accessionToName; +}; diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 717cb58c..fa110167 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -313,19 +313,22 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), __max(strlen(GenomeFileName), strlen(LiftedIndexDirName))))) + 1); char *filenameBuffer = new char[filenameBufferSize]; - fprintf(stderr,"Saving genome..."); - _int64 start = timeInMillis(); - snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); - if (!genome->saveToFile(filenameBuffer)) { - WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n"); - delete[] filenameBuffer; - return false; + _int64 start; + if (unliftedIndex == NULL) { + fprintf(stderr, "Saving genome..."); + start = timeInMillis(); + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); + if (!genome->saveToFile(filenameBuffer)) { + WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n"); + delete[] filenameBuffer; + return false; + } + fprintf(stderr, "%llds\n", (timeInMillis() + 500 - start) / 1000); } - fprintf(stderr,"%llds\n", (timeInMillis() + 500 - start) / 1000); GenomeIndex *index = new GenomeIndex(); index->genome = NULL; // We always delete the index when we're done, but we delete the genome first to save space during the overflow table build. - + GenomeDistance countOfBases = genome->getCountOfBases(); if (locationSize != 8 && countOfBases > ((_int64) 1 << (locationSize*8)) - 16) { WriteErrorMessage("Genome is too big for %d byte genome locations. Specify a larger location size with -locationSize\n", locationSize); @@ -419,6 +422,12 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla } } + index->seedLen = seedLen; + index->hashTableKeySize = hashTableKeySize; + index->largeHashTable = large; + index->locationSize = locationSize; + index->genome = genome; + for (unsigned i = 0; i < nThreads; i++) { threadContexts[i].whichThread = i; threadContexts[i].nThreads = nThreads; @@ -697,8 +706,10 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla } totalBytesWritten += bytesWrittenThisHashTable; - delete hashTables[whichHashTable]; - hashTables[whichHashTable] = NULL; + if (!(genomeHasAlts && unliftedIndex == NULL)) { + delete hashTables[whichHashTable]; + hashTables[whichHashTable] = NULL; + } } // for each hash table fclose(tablesFile); @@ -783,8 +794,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); - delete genome; - genome = NULL; if (!ok) { WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer); soft_exit(1); @@ -792,6 +801,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla } } + index->genome = NULL; // deleted earlier delete index; if (computeBias && biasTable != NULL) { delete[] biasTable; @@ -895,7 +905,7 @@ SNAPHashTable** GenomeIndex::allocateHashTables( -GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL), hasAlts(FALSE), liftedIndex(NULL) +GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL), hasAlts(false), liftedIndex(NULL) { } @@ -1258,8 +1268,7 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) if (!lift) { indexSeed(genomeLocation, seed, batches, context, &stats, large); - } - else { + } else { indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large); } } // For each genome base in our area @@ -1355,7 +1364,27 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa else { const unsigned *hits, *rcHits; context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); - CHECK_ALTS_AND_ADD_LIFTED + // CHECK_ALTS_AND_ADD_LIFTED + if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) { + bool anyAlts = false; + for (int i = 0; i < nHits && !anyAlts; i++) { + anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; + } + for (int i = 0; i < nRCHits && !anyAlts; i++) { + anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; + } + if (anyAlts) { + for (int i = 0; i < nHits; i++) { + indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large); + } + if (!seed.isOwnReverseComplement()) { + Seed rcSeed = ~seed; + for (int i = 0; i < nRCHits; i++) { + indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large); + } + } + } + } } } @@ -1929,7 +1958,7 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, boo blobFile = NULL; } - if (!liftedIndex) { + if (liftedIndex == NULL) { snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) { WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n"); @@ -1956,10 +1985,12 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, boo soft_exit(1); } index->liftedIndex->genome = index->genome; + index->hasAlts = true; } else { index->liftedIndex = NULL; } } else { + index->hasAlts = true; index->genome = NULL; } diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index c0c30b3e..a9ca597d 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -56,6 +56,7 @@ IntersectingPairedEndAligner::IntersectingPairedEndAligner( maxSecondaryAlignmentsPerContig(maxSecondaryAlignmentsPerContig_) { doesGenomeIndexHave64BitLocations = index->doesGenomeIndexHave64BitLocations(); + doesGenomeIndexHaveAlts = index->getGenome()->hasAltContigs(); unsigned maxSeedsToUse; if (0 != numSeedsFromCommandLine) { @@ -670,7 +671,7 @@ IntersectingPairedEndAligner::align( } #endif // _DEBUG - _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore); + // !! FIX THIS BEFORE CHECKIN !! _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore); mate->scoreLimit = scoreLimit - fewerEndScore; } @@ -1275,8 +1276,8 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren anyFound = true; mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; if (actualUnliftedGenomeLocationFound != NULL) { - *actualUnliftedGenomeLocationFound = doesGenomeIndexHave64BitLocations - ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]; + *actualUnliftedGenomeLocationFound = (doesGenomeIndexHave64BitLocations + ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]) - seedOffset; } *seedOffsetFound = seedOffset; } @@ -1353,6 +1354,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren // GenomeLocation foundLocation = 0; bool anyFound = false; + const bool setUnlifted = unliftedGenomeLocation != NULL; // // Run through the lookups pushing up any that are at the most recently returned @@ -1362,6 +1364,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren _int64 *currentHitForIntersection; _int64 nHits; GenomeLocation hitLocation; + GenomeLocation unliftedHitLocation; unsigned seedOffset; // @@ -1373,6 +1376,9 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren seedOffset = lookups[i].seedOffset; \ if (nHits != *currentHitForIntersection) { \ hitLocation = lookups[i].hits[*currentHitForIntersection]; \ + if (setUnlifted) { \ + unliftedHitLocation = lookups[i].unliftedHits[*currentHitForIntersection]; \ + } \ } @@ -1392,8 +1398,14 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren } if (doesGenomeIndexHave64BitLocations) { hitLocation = lookups64[i].hits[*currentHitForIntersection]; + if (setUnlifted) { + unliftedHitLocation = lookups64[i].unliftedHits[*currentHitForIntersection]; + } } else { hitLocation = lookups32[i].hits[*currentHitForIntersection]; + if (setUnlifted) { + unliftedHitLocation = lookups32[i].unliftedHits[*currentHitForIntersection]; + } } } @@ -1402,6 +1414,9 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren hitLocation >= seedOffset) // found location isn't too small to push us before the beginning of the genome { *genomeLocation = foundLocation = hitLocation - seedOffset; + if (setUnlifted) { + *unliftedGenomeLocation = unliftedHitLocation - seedOffset; + } *seedOffsetFound = seedOffset; anyFound = true; } diff --git a/tests/alttestgen.py b/tests/alttestgen.py index 16769edc..8129565b 100644 --- a/tests/alttestgen.py +++ b/tests/alttestgen.py @@ -48,11 +48,11 @@ def __str__(self): def to_sam_pair(self, other): r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( - self.id, 99, self.chr, self.pos, 60, len(self.seq), other.chr, - other.pos, abs(self.pos - other.pos + len(other.seq)), self.seq, 'A'*len(self.seq)) + self.id, 99, self.chr, self.pos + 1, 60, len(self.seq), other.chr, + other.pos + 1, abs(self.pos - other.pos + len(other.seq)), self.seq, 'A'*len(self.seq)) return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( - other.id, 147, other.chr, other.pos, 60, len(other.seq), self.chr, - self.pos, abs(self.pos - other.pos + len(other.seq)), other.seq, 'A'*len(other.seq)) + other.id, 147, other.chr, other.pos + 1, 60, len(other.seq), self.chr, + self.pos + 1, abs(self.pos - other.pos + len(other.seq)), other.seq, 'A'*len(other.seq)) class Contig: def __init__(self, name, accession, seq, isAlt=False, parent=None, parentLoc = 0, isAltRC=False): @@ -76,7 +76,7 @@ def __init__(self, contigs={}): def add(self, contig): self.contigs[contig.name] = contig - def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.1): + def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05): pc = self.contigs[parent] altseq = random_mutate(pc.seq[start:stop], pmut) if (isRC): @@ -88,21 +88,23 @@ def get_seq(self, chr, start, end): def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None): if id == None: - id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos, ('r' if isRC else 'f')) + id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isRC else 'f')) return Read(id, chr, pos, random_mutate(self.get_seq(chr, pos, pos + len), pmut)) def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02): - id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1, chr2, pos2) + id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1+1, chr2, pos2+1) r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1") r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2") return [r1, r2] def write_fasta(self, filename): with open(filename, 'w') as file: - for contig in self.contigs.values(): - file.write(">{}|gb|{}\n".format(contig.name, contig.accession)) - for i in range(0, len(contig.seq), 80): - file.write("{}\n".format(contig.seq[i:i+80])) + for write_alts in [False, True]: + for contig in self.contigs.values(): + if contig.isAlt == write_alts: + file.write(">{}|gb|{}\n".format(contig.name, contig.accession)) + for i in range(0, len(contig.seq), 80): + file.write("{}\n".format(contig.seq[i:i+80])) def write_alts(self, filename): with open(filename, 'w') as file: @@ -114,12 +116,14 @@ def write_alts(self, filename): 1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0)) g = Genome() -g.add(Contig("chr1", "C01", random_bases(2000))) -g.add_alt("chr1a", "C01A", "chr1", 500, 1500) +g.add(Contig("chr1", "C01", random_bases(3000))) +g.add_alt("chr1a", "C01A", "chr1", 1000, 2000) g.write_fasta("test.fa") g.write_alts("test_alts.txt") with open("test.sam", "w") as file: - for i in range(0, 101, 10): - [r1, r2] = g.make_pair('chr1', 500 + i, 'chr1a', i) + for i in range(100, 201, 20): + [r1, r2] = g.make_pair('chr1', i, 'chr1a', i) + file.write(r1.to_sam_pair(r2)) + [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000) file.write(r1.to_sam_pair(r2)) From cb7a567758f069eded69e2134ff65617216a55d7 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Wed, 9 Dec 2015 14:12:17 -0800 Subject: [PATCH 06/19] Handle minus strand alt contigs --- SNAPLib/Genome.cpp | 3 +- SNAPLib/IntersectingPairedEndAligner.cpp | 2 +- SNAPLib/SAM.cpp | 54 ++++++++++++++++++++---- tests/alttestgen.py | 21 ++++++--- 4 files changed, 64 insertions(+), 16 deletions(-) diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 06c350e1..42511534 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -311,7 +311,6 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc } genome->fillInContigLengths(); - genome->adjustAltContigs(NULL); genome->sortContigsByName(); delete[] contigNameBuffer; return genome; @@ -535,7 +534,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap) // flip RC contigs for (int i = 0; i < nContigs; i++) { if (contigs[i].isAlternate && contigs[i].isAlternateRC) { - util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length); + util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length - chromosomePadding); } } } diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index a9ca597d..45d2713b 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -630,7 +630,7 @@ IntersectingPairedEndAligner::align( scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsUnliftedGenomeLocation, candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset); - _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore); + // todo: fix _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore); #ifdef _DEBUG if (_DumpAlignments) { diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp index 19f32fcb..dfc7fb5e 100644 --- a/SNAPLib/SAM.cpp +++ b/SNAPLib/SAM.cpp @@ -1027,7 +1027,6 @@ SAMFormat::createSAMLine( { contigName = "*"; positionInContig = 0; - const char *cigar = "*"; templateLength = 0; if (secondaryAlignment) { @@ -1092,6 +1091,7 @@ SAMFormat::createSAMLine( contigIndex = (int)(contig - genome->getContigs()); positionInContig = genomeLocation - contig->beginningLocation + 1; // SAM is 1-based mapQuality = max(0, min(70, mapQuality)); // FIXME: manifest constant. + } else { flags |= SAM_UNMAPPED; mapQuality = 0; @@ -1228,13 +1228,17 @@ SAMFormat::writeRead( } if (genomeLocation != InvalidGenomeLocation) { - cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, - clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, - read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, - &editDistance, o_addFrontClipping); - if (*o_addFrontClipping != 0) { - return false; - } + if (!context.genome->getContigs()[contigIndex].isAlternateRC) { + cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, + clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, + read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, + &editDistance, o_addFrontClipping); + if (*o_addFrontClipping != 0) { + return false; + } + } else { + + } } @@ -1300,6 +1304,19 @@ SAMFormat::writeRead( readGroupString = read->getReadGroup(); } } + const Genome::Contig* contig = &context.genome->getContigs()[contigIndex]; + if (contig->isAlternateRC) { + // contig was reverse-complemented when building index + // so reverse flags, adjust position; CIGAR string was reversed in computeCigar + flags ^= SAM_REVERSE_COMPLEMENT; + positionInContig = 1 + max(0, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength); + } + const Genome::Contig* mateContig = &context.genome->getContigs()[mateContigIndex]; + if (mateContig->isAlternateRC) { + // same for mate + flags ^= SAM_NEXT_REVERSED; + matePositionInContig = 1 + max(0, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength); + } int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s%.*s\n", qnameLen, read->getId(), flags, @@ -1393,6 +1410,17 @@ SAMFormat::computeCigar( return; } + if (contig->isAlternateRC) { + // the original reference was reverse-complemented on index build to simplify alignment + // so reverse-complement both reference and data for CIGAR string + char* dataBuf = (char*)alloca(dataLength); + util::toComplement(dataBuf, data, dataLength); + data = dataBuf; + char* referenceBuf = (char*)alloca(dataLength + MAX_K); + util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K); + reference = referenceBuf; + } + *o_editDistance = lv->computeEditDistanceNormalized( reference, (int)(dataLength - *o_extraBasesClippedAfter + MAX_K), // Add space incase of indels. We know there's enough, because the reference is padded. @@ -1566,6 +1594,16 @@ SAMFormat::validateCigarString( WriteErrorMessage("validateCigarString: read alignment location isn't in a chromosome, genomeLocation %lld\n", GenomeLocationAsInt64(genomeLocation)); soft_exit(1); } + if (contig->isAlternateRC) { + // the original reference was reverse-complemented on index build to simplify alignment + // so reverse-complement both reference and data for CIGAR string + char* dataBuf = (char*)alloca(dataLength); + util::toComplement(dataBuf, data, dataLength); + data = dataBuf; + char* referenceBuf = (char*)alloca(dataLength + MAX_K); + util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K); + reference = referenceBuf; + } if (genomeLocation >= contig->beginningLocation + contig->length - genome->getChromosomePadding()) { WriteErrorMessage("validateCigarString: alignment location is in genome padding: %lld, contig name %s, base %lld, len %lld, padding size %d\n", diff --git a/tests/alttestgen.py b/tests/alttestgen.py index 8129565b..e8ce8420 100644 --- a/tests/alttestgen.py +++ b/tests/alttestgen.py @@ -84,12 +84,19 @@ def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05): self.add(Contig(name, accession, altseq, True, parent, start, isRC)) def get_seq(self, chr, start, end): - return self.contigs[chr].seq[start:end] + contig = self.contigs[chr] + if not contig.isAltRC: + return contig.seq[start:end] + else: + return rc(contig.seq[len(contig.seq) - end : len(contig.seq) - start]) def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None): if id == None: id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isRC else 'f')) - return Read(id, chr, pos, random_mutate(self.get_seq(chr, pos, pos + len), pmut)) + seq = random_mutate(self.get_seq(chr, pos, pos + len)) + if isRC: + seq = rc(seq) + return Read(id, chr, pos, seq, pmut) def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02): id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1+1, chr2, pos2+1) @@ -116,14 +123,18 @@ def write_alts(self, filename): 1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0)) g = Genome() -g.add(Contig("chr1", "C01", random_bases(3000))) +g.add(Contig("chr1", "C01", random_bases(5000))) g.add_alt("chr1a", "C01A", "chr1", 1000, 2000) +g.add_alt("chr1b", "C01B", "chr1", 3000, 4000, True) g.write_fasta("test.fa") g.write_alts("test_alts.txt") with open("test.sam", "w") as file: - for i in range(100, 201, 20): - [r1, r2] = g.make_pair('chr1', i, 'chr1a', i) + for i in [100, 150, 200, 250, 2100, 2150, 2200, 2250]: + if i < 2000: + [r1, r2] = g.make_pair('chr1', i, 'chr1a' , i) + else: + [r1, r2] = g.make_pair('chr1', i, 'chr1b' , i - 2000) file.write(r1.to_sam_pair(r2)) [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000) file.write(r1.to_sam_pair(r2)) From 89a1f923858e8076ee40189cc5d324d664198ecb Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Wed, 9 Dec 2015 14:23:04 -0800 Subject: [PATCH 07/19] Allow extra columns in alt map --- SNAPLib/Genome.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 42511534..5bac7d14 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -717,7 +717,7 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum if (endOfFile) { break; } - switch (columnTypes[columnIndex]) { + switch (columnIndex < columnTypes.size() ? columnTypes[columnIndex] : N_COLUMNS) { case ALT_SCAF_ACC: alt.accession = p; break; From 42717f9d6b863dc99b9825fbaae241d9facf452b Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Wed, 9 Dec 2015 15:31:58 -0800 Subject: [PATCH 08/19] Compile on Linux --- SNAPLib/BaseAligner.cpp | 4 ++-- SNAPLib/Genome.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp index 02efa6c0..d9fdef43 100644 --- a/SNAPLib/BaseAligner.cpp +++ b/SNAPLib/BaseAligner.cpp @@ -797,7 +797,7 @@ Return Value: unsigned weightListToCheck = highestUsedWeightList; MatchInfoVector* allMatches = NULL; - bool anyAltMatches = FALSE; + bool anyAltMatches = false; if (genome->hasAltContigs()) { allMatches = new MatchInfoVector(); } @@ -977,7 +977,7 @@ Return Value: // remember in case there are alt matches if (allMatches != NULL) { if ((! anyAltMatches) && genome->getContigAtLocation(genomeLocation)->isAlternate) { - anyAltMatches = TRUE; + anyAltMatches = true; } allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability)); } diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index d65d9008..fa593890 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -246,7 +246,7 @@ class Genome { struct Contig { Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL), - isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0) {} + isAlternate(false), isReverseStrand(false), liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0) {} GenomeLocation beginningLocation; GenomeDistance length; bool isAlternate; @@ -267,7 +267,7 @@ class Genome { const Contig *getNextContigAfterLocation(GenomeLocation location) const; int getContigNumAtLocation(GenomeLocation location) const; // Returns the contig number, which runs from 0 .. getNumContigs() - 1. - inline bool hasAltContigs() const { return FALSE; } // todo: implement + inline bool hasAltContigs() const { return false; } // todo: implement GenomeLocation getLiftedLocation(GenomeLocation altLocation) const { return altLocation; } // todo: implement From caa79174fcf57632ef8cc6985e8c80ba3eea511d Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Wed, 9 Dec 2015 16:17:50 -0800 Subject: [PATCH 09/19] Compile on Linux --- SNAPLib/Genome.cpp | 12 ++++++------ SNAPLib/GenomeIndex.cpp | 4 ++-- SNAPLib/SAM.cpp | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 5bac7d14..fee4a750 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -277,7 +277,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc } genome->contigs[i].liftedLocation = liftedLocation; - if (isAlternate && contigStart < genome->minAltLocation.location) { + if (isAlternate && contigStart < (_int64)genome->minAltLocation) { genome->minAltLocation = contigStart - chromosomePadding / 2; } } @@ -534,7 +534,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap) // flip RC contigs for (int i = 0; i < nContigs; i++) { if (contigs[i].isAlternate && contigs[i].isAlternateRC) { - util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length - chromosomePadding); + util::toComplement(bases + (_int64)contigs[i].beginningLocation, NULL, (int) contigs[i].length - chromosomePadding); } } } @@ -651,7 +651,6 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum } char* q = strchr(p, ','); if (q == NULL) { -err_invalid_column_spec: WriteErrorMessage("Invalid columns spec %s\n", columns); soft_exit(1); } @@ -674,7 +673,8 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum } else { q = p + strlen(p); if (i < PARENT_STOP) { - goto err_invalid_column_spec; + WriteErrorMessage("Invalid columns spec %s\n", columns); + soft_exit(1); } break; } @@ -689,7 +689,6 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum for (int columnIndex = 0; !endOfLine; columnIndex++) { q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile); if (q == NULL) { -err_file_format: WriteErrorMessage("Invalid file format for alt data in %s\n", filename); soft_exit(1); } @@ -706,7 +705,8 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum } for (int i = 0; i < N_COLUMNS; i++) { if (columnNames[i] != NULL && !columnFound[i]) { - goto err_file_format; + WriteErrorMessage("Invalid file format for alt data in %s\n", filename); + soft_exit(1); } } while (!endOfFile) { diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index fa110167..e3b672f8 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -792,7 +792,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla if (genomeHasAlts && unliftedIndex == NULL) { // create a sub-index with only seeds that occur in alt contigs snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); - bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, + bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); if (!ok) { WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer); @@ -1958,7 +1958,7 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, boo blobFile = NULL; } - if (liftedIndex == NULL) { + if (!liftedIndex) { snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName); if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) { WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n"); diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp index dfc7fb5e..7908aea1 100644 --- a/SNAPLib/SAM.cpp +++ b/SNAPLib/SAM.cpp @@ -1309,13 +1309,13 @@ SAMFormat::writeRead( // contig was reverse-complemented when building index // so reverse flags, adjust position; CIGAR string was reversed in computeCigar flags ^= SAM_REVERSE_COMPLEMENT; - positionInContig = 1 + max(0, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength); + positionInContig = 1 + max(0L, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength); } const Genome::Contig* mateContig = &context.genome->getContigs()[mateContigIndex]; if (mateContig->isAlternateRC) { // same for mate flags ^= SAM_NEXT_REVERSED; - matePositionInContig = 1 + max(0, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength); + matePositionInContig = 1 + max(0L, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength); } int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s%.*s\n", qnameLen, read->getId(), From 58069569cd4bf84da0c53bc3d4569a33a916de91 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Wed, 9 Dec 2015 18:03:48 -0800 Subject: [PATCH 10/19] Fix windows compile now --- SNAPLib/Genome.cpp | 4 ++-- SNAPLib/SAM.cpp | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index fee4a750..39428568 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -277,7 +277,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc } genome->contigs[i].liftedLocation = liftedLocation; - if (isAlternate && contigStart < (_int64)genome->minAltLocation) { + if (isAlternate && contigStart < GenomeLocationAsInt64(genome->minAltLocation)) { genome->minAltLocation = contigStart - chromosomePadding / 2; } } @@ -534,7 +534,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap) // flip RC contigs for (int i = 0; i < nContigs; i++) { if (contigs[i].isAlternate && contigs[i].isAlternateRC) { - util::toComplement(bases + (_int64)contigs[i].beginningLocation, NULL, (int) contigs[i].length - chromosomePadding); + util::toComplement(bases + GenomeLocationAsInt64(contigs[i].beginningLocation), NULL, (int) contigs[i].length - chromosomePadding); } } } diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp index 7908aea1..c2143529 100644 --- a/SNAPLib/SAM.cpp +++ b/SNAPLib/SAM.cpp @@ -1414,10 +1414,10 @@ SAMFormat::computeCigar( // the original reference was reverse-complemented on index build to simplify alignment // so reverse-complement both reference and data for CIGAR string char* dataBuf = (char*)alloca(dataLength); - util::toComplement(dataBuf, data, dataLength); + util::toComplement(dataBuf, data, (int)dataLength); data = dataBuf; char* referenceBuf = (char*)alloca(dataLength + MAX_K); - util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K); + util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K); reference = referenceBuf; } @@ -1598,10 +1598,10 @@ SAMFormat::validateCigarString( // the original reference was reverse-complemented on index build to simplify alignment // so reverse-complement both reference and data for CIGAR string char* dataBuf = (char*)alloca(dataLength); - util::toComplement(dataBuf, data, dataLength); + util::toComplement(dataBuf, data, (int)dataLength); data = dataBuf; char* referenceBuf = (char*)alloca(dataLength + MAX_K); - util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K); + util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K); reference = referenceBuf; } From 75b3361f22e9a8d69ac5ff16e6e61164d94309d4 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Sun, 13 Dec 2015 07:16:10 -0800 Subject: [PATCH 11/19] Fix alt index build --- SNAPLib/Genome.cpp | 5 ++--- SNAPLib/GenomeIndex.cpp | 29 +++++------------------------ 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 39428568..2869008b 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -593,7 +593,6 @@ char* tokenizeToNextTabOrNewline(char* start, bool* endOfLine, bool* endOfFile) } else if (*p == '\r' || *p == '\n') { if (*(p + 1) != *p && (*(p + 1) == '\r' || *(p + 1) == '\n')) { *p++ = '\0'; - } else { } *p = '\0'; *endOfLine = true; @@ -694,11 +693,11 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum } for (int i = 0; i <= N_COLUMNS; i++) { if (i < N_COLUMNS && !strcmp(columnNames[i], p)) { - columnTypes.add(i); + columnTypes.push_back(i); columnFound[i] = true; break; } else if (i == N_COLUMNS) { - columnTypes.add(N_COLUMNS); // ignore this column + columnTypes.push_back(N_COLUMNS); // ignore this column } } p = q; diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index e3b672f8..4db0fef1 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -1339,7 +1339,8 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa GenomeLocation singleHit[2], singleRCHit[2]; context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); #define CHECK_ALTS_AND_ADD_LIFTED \ - if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) { \ + if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ + (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \ bool anyAlts = false; \ for (int i = 0; i < nHits && ! anyAlts; i++) { \ anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; \ @@ -1348,12 +1349,12 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; \ } \ if (anyAlts) { \ - for (int i = 0; i < nHits && !anyAlts; i++) { \ + for (int i = 0; i < nHits; i++) { \ indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large); \ } \ if (!seed.isOwnReverseComplement()) { \ Seed rcSeed = ~seed; \ - for (int i = 0; i < nRCHits && !anyAlts; i++) { \ + for (int i = 0; i < nRCHits; i++) { \ indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large); \ } \ } \ @@ -1364,27 +1365,7 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa else { const unsigned *hits, *rcHits; context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); - // CHECK_ALTS_AND_ADD_LIFTED - if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) { - bool anyAlts = false; - for (int i = 0; i < nHits && !anyAlts; i++) { - anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; - } - for (int i = 0; i < nRCHits && !anyAlts; i++) { - anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; - } - if (anyAlts) { - for (int i = 0; i < nHits; i++) { - indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large); - } - if (!seed.isOwnReverseComplement()) { - Seed rcSeed = ~seed; - for (int i = 0; i < nRCHits; i++) { - indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large); - } - } - } - } + CHECK_ALTS_AND_ADD_LIFTED } } From 237186c8b5af0cff8220c79a41a03eebeec1c3ac Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Fri, 8 Jan 2016 10:31:04 -0800 Subject: [PATCH 12/19] Coordinate sort of lifted/unlifted index; fix RC handling; secondary results --- SNAPLib/BaseAligner.cpp | 1 + SNAPLib/Genome.cpp | 12 +- SNAPLib/Genome.h | 5 +- SNAPLib/GenomeIndex.cpp | 146 +++++++++++++++++++---- SNAPLib/GenomeIndex.h | 4 +- SNAPLib/IntersectingPairedEndAligner.cpp | 73 +++++++++--- SNAPLib/IntersectingPairedEndAligner.h | 3 + SNAPLib/SAM.cpp | 122 +++++++++---------- tests/alttestgen.py | 48 ++++---- 9 files changed, 272 insertions(+), 142 deletions(-) diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp index 39c3489f..cd6b72eb 100644 --- a/SNAPLib/BaseAligner.cpp +++ b/SNAPLib/BaseAligner.cpp @@ -227,6 +227,7 @@ Routine Description: hashTableEpoch = 0; if (genome->hasAltContigs()) { + // todo: BigAlloc / new(allocator) -> fixed size, avoid reallocs; reserve space for max size allMatches = new MatchInfoVector(); } diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 2869008b..8a6a4fb4 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -506,7 +506,8 @@ void Genome::adjustAltContigs(AltContigMap* altMap) if (contigs[i].beginningLocation < minAltLocation) { minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2; } - const char* parentName = altMap->getParentContigName(contigs[i].name); + GenomeDistance offset; + const char* parentName = altMap->getParentContigName(contigs[i].name, &offset); if (parentName == NULL) { WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name); error = true; @@ -523,7 +524,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap) WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName); error = true; continue; } - contigs[i].liftedLocation = parentLocation; + contigs[i].liftedLocation = parentLocation + offset; } } if (error) { @@ -534,7 +535,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap) // flip RC contigs for (int i = 0; i < nContigs; i++) { if (contigs[i].isAlternate && contigs[i].isAlternateRC) { - util::toComplement(bases + GenomeLocationAsInt64(contigs[i].beginningLocation), NULL, (int) contigs[i].length - chromosomePadding); + util::toComplement(bases + GenomeLocationAsInt64(contigs[i].beginningLocation), NULL, (int) contigs[i].length - chromosomePadding); } } } @@ -805,7 +806,7 @@ void AltContigMap::setAltContig(Genome::Contig* contig) contig->isAlternateRC = false; } -const char* AltContigMap::getParentContigName(const char* altName) +const char* AltContigMap::getParentContigName(const char* altName, GenomeDistance* pOffset) { StringMap::iterator accession = nameToAccession.find(altName); if (accession != nameToAccession.end()) { @@ -813,6 +814,9 @@ const char* AltContigMap::getParentContigName(const char* altName) if (alt != altsByAccession.end()) { StringMap::iterator parent = accessionToName.find(alt->second.parentAccession); if (parent != accessionToName.end()) { + if (pOffset != NULL) { + *pOffset = alt->second.parentStart - alt->second.start; + } return parent->second.data(); } } diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index 38c8d087..c01a3b61 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -276,6 +276,9 @@ class Genome { GenomeLocation getLiftedLocation(GenomeLocation altLocation) const; + inline bool isAltLocation(GenomeLocation location) const + { return location != InvalidGenomeLocation && location >= minAltLocation && getLiftedLocation(location) != location; } + // unused Genome *copy() const {return copy(true,true,true);} // unused Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);} @@ -337,7 +340,7 @@ class AltContigMap void setAltContig(Genome::Contig* contig); - const char* getParentContigName(const char* altName); + const char* getParentContigName(const char* altName, GenomeDistance* pOffset = NULL); private: diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 4db0fef1..4ae13022 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -387,9 +387,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla volatile _int64 nBasesProcessed = 0; volatile int runningThreadCount; - SingleWaiterObject doneObject; - CreateSingleWaiterObject(&doneObject); - unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads); BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads]; @@ -398,6 +395,14 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla InitializeExclusiveLock(&hashTableLocks[i]); } + // lifted index needs to be done in two passes, first to build and then to sort + int liftedIndexPass = 0; + +lifted_index_pass_start: + + SingleWaiterObject doneObject; + CreateSingleWaiterObject(&doneObject); + runningThreadCount = nThreads; GenomeDistance nextChunkToProcess = 0; @@ -459,6 +464,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla threadContexts[i].lastBackpointerIndexUsedByThread = lastBackpointerIndexUsedByThread; threadContexts[i].backpointerSpillFile = backpointerSpillFile; threadContexts[i].unliftedIndex = unliftedIndex; + threadContexts[i].liftedIndexPass = liftedIndexPass; StartNewThread(BuildHashTablesWorkerThreadMain, &threadContexts[i]); } @@ -496,8 +502,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // We're done with the raw genome. Delete it to save some memory. // - bool genomeHasAlts = genome->hasAltContigs(); - if (! (genomeHasAlts && unliftedIndex == NULL)) { + if (!genome->hasAltContigs()) { // delete if we won't need it later delete genome; genome = NULL; @@ -535,6 +540,10 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla WriteStatusMessage("%llds\n", (timeInMillis() - spillDone + 500) / 1000); } + if (unliftedIndex != NULL && liftedIndexPass == 1) { + goto lifted_skip_overflow; + } + WriteStatusMessage("Building overflow table.\n"); start = timeInMillis(); fflush(stdout); @@ -706,7 +715,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla } totalBytesWritten += bytesWrittenThisHashTable; - if (!(genomeHasAlts && unliftedIndex == NULL)) { + if (genome == NULL || !genome->hasAltContigs()) { delete hashTables[whichHashTable]; hashTables[whichHashTable] = NULL; } @@ -716,6 +725,13 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla _ASSERT(overflowTableIndex == index->overflowTableSize); // We used exactly what we expected to use. + if (unliftedIndex != NULL && liftedIndexPass == 0) { + liftedIndexPass = 1; + goto lifted_index_pass_start; + } + +lifted_skip_overflow: + delete overflowAnchor; overflowAnchor = NULL; @@ -783,13 +799,13 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", // NOTE: this must be changed if the format no longer supports v5 (pre-alt) - genomeHasAlts ? GenomeIndexFormatMajorVersion : GenomeIndexFormatMajorVersionWithoutAlts, + genome != NULL && genome->hasAltContigs() ? GenomeIndexFormatMajorVersion : GenomeIndexFormatMajorVersionWithoutAlts, GenomeIndexFormatMinorVersion, index->nHashTables, index->overflowTableSize, seedLen, chromosomePaddingSize, hashTableKeySize, totalBytesWritten, large ? 0 : 1, locationSize); fclose(indexFile); - if (genomeHasAlts && unliftedIndex == NULL) { + if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) { // create a sub-index with only seeds that occur in alt contigs snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, @@ -1264,12 +1280,18 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) continue; } - Seed seed(bases, seedLen); + Seed seed(bases, seedLen); if (!lift) { indexSeed(genomeLocation, seed, batches, context, &stats, large); } else { - indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large); + // in the lifted case, we first do one pass to index lifted seeds + // and then another pass to sort the unlifted locations by the lifted locations so they correspond + if (context->liftedIndexPass == 0) { + indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large); + } else { + resortLiftedSeed(genomeLocation, seed, batches, context, &stats, large); + } } } // For each genome base in our area @@ -1295,7 +1317,6 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context) const _int64 GenomeIndex::printPeriod = 100000000; - void GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) { @@ -1308,28 +1329,27 @@ GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBat _ASSERT(whichHashTable < nHashTables); if (batches[whichHashTable].addSeed(genomeLocation, seed.getLowBases(context->hashTableKeySize), usingComplement)) { - AcquireExclusiveLock(&context->hashTableLocks[whichHashTable]); - for (unsigned i = 0; i < batches[whichHashTable].nUsed; i++) { - ApplyHashTableUpdate(context, whichHashTable, batches[whichHashTable].entries[i].genomeLocation, - batches[whichHashTable].entries[i].lowBases, batches[whichHashTable].entries[i].usingComplement, - &stats->bothComplementsUsed, &stats->genomeLocationsInOverflowTable, &stats->seedsWithMultipleOccurrences, large); - } - ReleaseExclusiveLock(&context->hashTableLocks[whichHashTable]); + AcquireExclusiveLock(&context->hashTableLocks[whichHashTable]); + for (unsigned i = 0; i < batches[whichHashTable].nUsed; i++) { + ApplyHashTableUpdate(context, whichHashTable, batches[whichHashTable].entries[i].genomeLocation, + batches[whichHashTable].entries[i].lowBases, batches[whichHashTable].entries[i].usingComplement, + &stats->bothComplementsUsed, &stats->genomeLocationsInOverflowTable, &stats->seedsWithMultipleOccurrences, large); + } + ReleaseExclusiveLock(&context->hashTableLocks[whichHashTable]); - _int64 newNBasesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[whichHashTable].nUsed + stats->unrecordedSkippedSeeds); + _int64 newNBasesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[whichHashTable].nUsed + stats->unrecordedSkippedSeeds); - if ((unsigned)(newNBasesProcessed / printPeriod) > (unsigned)((newNBasesProcessed - batches[whichHashTable].nUsed - stats->unrecordedSkippedSeeds) / printPeriod)) { - WriteStatusMessage("Indexing %lld / %lld\n", (newNBasesProcessed / printPeriod) * printPeriod, context->genome->getCountOfBases()); - } - stats->unrecordedSkippedSeeds = 0; - batches[whichHashTable].clear(); - } // If we filled a batch + if ((unsigned)(newNBasesProcessed / printPeriod) >(unsigned)((newNBasesProcessed - batches[whichHashTable].nUsed - stats->unrecordedSkippedSeeds) / printPeriod)) { + WriteStatusMessage("Indexing %lld / %lld\n", (newNBasesProcessed / printPeriod) * printPeriod, context->genome->getCountOfBases()); + } + stats->unrecordedSkippedSeeds = 0; + batches[whichHashTable].clear(); + } // If we filled a batch } void GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) { - // todo: optimize // if this is first occurrence of seed in unlifted index, checks if seed is in any alts // and if so, adds all locations to this index, lifting alts to non-alt locations @@ -1368,6 +1388,80 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa CHECK_ALTS_AND_ADD_LIFTED } } + + void +dualSort32( + _int64 n, + unsigned* keys, + unsigned* values) +{ + // todo: optimize sorting, just using a simple selection sort for now + unsigned t; +#define DUAL_SORT \ + if (n < 2) { \ + return; \ + } \ + for (_int64 i = 0; i < n - 1; i++) { \ + for (_int64 j = n - 1; j > i; j--) { \ + if (keys[i] > keys[j]) { \ + t = keys[i]; \ + keys[i] = keys[j]; \ + keys[j] = t; \ + t = values[i]; \ + values[i] = values[j]; \ + values[j] = t; \ + } \ + } \ + } + DUAL_SORT +} + + void +dualSort( + _int64 n, + GenomeLocation* keys, + GenomeLocation* values) +{ + GenomeLocation t; + DUAL_SORT +} + + void +GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) +{ + // redo lifting and then sort both lists by lifted location + // NOTE: this leaves the unlifted list in a non-sorted order + + _int64 nHits, nRCHits; + _int64 nLiftedHits, nLiftedRCHits; + if (doesGenomeIndexHave64BitLocations()) { + const GenomeLocation *hits, *rcHits; + GenomeLocation singleHit[2], singleRCHit[2]; + const GenomeLocation *liftedHits, *liftedRCHits; + GenomeLocation liftedSingleHit[2], liftedSingleRCHit[2]; + lookupSeed(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits, &liftedSingleHit[1], &liftedSingleRCHit[1]); + if (nLiftedHits > 1 || nLiftedRCHits > 1) { + context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); + _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); + if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) { + dualSort(nHits, (GenomeLocation*)liftedHits, (GenomeLocation*)hits); + dualSort(nRCHits, (GenomeLocation*)liftedRCHits, (GenomeLocation*)rcHits); + } + } + } else { + const unsigned *hits, *rcHits; + const unsigned *liftedHits, *liftedRCHits; + lookupSeed32(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits); + if (nLiftedHits > 1 || nLiftedRCHits > 1) { + context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); + _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); + if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) { + dualSort32(nHits, (unsigned*)liftedHits, (unsigned*)hits); + dualSort32(nRCHits, (unsigned*)liftedRCHits, (unsigned*)rcHits); + } + } + } +} void GenomeIndex::ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement, diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index ca45201e..c0a45840 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -244,6 +244,7 @@ class GenomeIndex { // used for building sub-index of only seeds that occur in alt contigs GenomeIndex *unliftedIndex; + int liftedIndexPass; ExclusiveLock *hashTableLocks; ExclusiveLock *overflowTableLock; @@ -294,8 +295,9 @@ class GenomeIndex { virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); virtual void indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); + virtual void resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large); - + static void BuildHashTablesWorkerThreadMain(void *param); void BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context); static void ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement, diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index 45d2713b..c1cc00ec 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -33,6 +33,9 @@ Revision History: extern bool _DumpAlignments; // From BaseAligner.cpp #endif // _DEBUG +static const double EPSILON_FACTOR_HI = 1.0000000001; +static const double EPSILON_FACTOR_LO = 0.9999999999; + IntersectingPairedEndAligner::IntersectingPairedEndAligner( GenomeIndex *index_, unsigned maxReadSize_, @@ -202,6 +205,7 @@ IntersectingPairedEndAligner::align( GenomeLocation bestResultGenomeLocation[NUM_READS_PER_PAIR]; Direction bestResultDirection[NUM_READS_PER_PAIR]; unsigned bestResultScore[NUM_READS_PER_PAIR]; + bool bestPairHasAlts = false; unsigned popularSeedsSkipped[NUM_READS_PER_PAIR]; reads[0][FORWARD] = read0; @@ -630,7 +634,7 @@ IntersectingPairedEndAligner::align( scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsUnliftedGenomeLocation, candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset); - // todo: fix _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore); + _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore); #ifdef _DEBUG if (_DumpAlignments) { @@ -671,7 +675,7 @@ IntersectingPairedEndAligner::align( } #endif // _DEBUG - // !! FIX THIS BEFORE CHECKIN !! _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore); + _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore); mate->scoreLimit = scoreLimit - fewerEndScore; } @@ -694,6 +698,7 @@ IntersectingPairedEndAligner::align( // because it's a worse version of this location. // MergeAnchor *mergeAnchor = candidate->mergeAnchor; + MergeAnchor *unliftedMergeAnchor = candidate->unliftedMergeAnchor; if (NULL == mergeAnchor) { // @@ -707,6 +712,7 @@ IntersectingPairedEndAligner::align( if (mergeCandidate->mergeAnchor != NULL) { candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor; + candidate->unliftedMergeAnchor = mergeAnchor = mergeCandidate->unliftedMergeAnchor; break; } } @@ -716,10 +722,11 @@ IntersectingPairedEndAligner::align( mergeCandidate < scoringCandidatePool + lowestFreeScoringCandidatePoolEntry && genomeLocationIsWithin(mergeCandidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, 50) && mergeCandidate->whichSetPair == candidate->whichSetPair; - mergeCandidate--) { + mergeCandidate++) { if (mergeCandidate->mergeAnchor != NULL) { candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor; + candidate->unliftedMergeAnchor = unliftedMergeAnchor = mergeCandidate->unliftedMergeAnchor; break; } } @@ -727,11 +734,12 @@ IntersectingPairedEndAligner::align( } bool merged; + bool mergedUnlifted; double oldPairProbability; if (NULL == mergeAnchor) { - if (firstFreeMergeAnchor >= mergeAnchorPoolSize) { + if (firstFreeMergeAnchor >= mergeAnchorPoolSize - doesGenomeIndexHaveAlts) { WriteErrorMessage("Ran out of merge anchor pool entries. Perhaps rerunning with a larger value of -mcp will help\n"); soft_exit(1); } @@ -744,30 +752,45 @@ IntersectingPairedEndAligner::align( pairProbability, pairScore); merged = false; + mergedUnlifted = false; oldPairProbability = 0; candidate->mergeAnchor = mergeAnchor; + if (doesGenomeIndexHaveAlts) { + unliftedMergeAnchor = &mergeAnchorPool[firstFreeMergeAnchor]; + candidate->unliftedMergeAnchor = unliftedMergeAnchor; + firstFreeMergeAnchor++; + unliftedMergeAnchor->init(mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset, + pairProbability, pairScore); + } } else { merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, pairProbability, pairScore, doesGenomeIndexHaveAlts && (! candidate->isAlt()) && (!mate->isAlt()), &oldPairProbability); + if (unliftedMergeAnchor != NULL) { + double ignore; + mergedUnlifted = merged && unliftedMergeAnchor->checkMerge(mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset, + pairProbability, pairScore, false, &ignore); + } } - if (!merged) { + if (!(merged && mergedUnlifted)) { // // Back out the probability of the old match that we're merged with, if any. The max // is necessary because a + b - b is not necessarily a in floating point. If there // was no merge, the oldPairProbability is 0. // - probabilityOfAllPairs = __max(0, probabilityOfAllPairs - oldPairProbability); - + if (!merged) { + probabilityOfAllPairs = __max(0, probabilityOfAllPairs - oldPairProbability); + } bool isBestHit = false; if (pairScore <= maxK && (pairScore < bestPairScore || - (pairScore == bestPairScore && (pairProbability > probabilityOfBestPair || - (pairProbability == probabilityOfBestPair && (! candidate->isAlt()) && (!mate->isAlt())))))) { + (pairScore == bestPairScore && (pairProbability >= probabilityOfBestPair*EPSILON_FACTOR_HI || + (bestPairHasAlts && pairProbability >= probabilityOfBestPair*EPSILON_FACTOR_LO && (!candidate->isAlt()) && (!mate->isAlt())))))) { // // A new best hit. // - if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= pairScore - bestPairScore) { + // Code review note: was pairScore-bestPairScore which is negative int, i.e. very large unsigned, so would only save secondary w/equal score + if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= bestPairScore - pairScore) { // // Move the old best to be a secondary alignment. This won't happen on the first time we get a valid alignment, // because bestPairScore is initialized to be very large. @@ -801,6 +824,7 @@ IntersectingPairedEndAligner::align( bestResultScore[readWithMoreHits] = mate->score; bestResultDirection[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits]; bestResultDirection[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits]; + bestPairHasAlts = candidate->isAlt() || mate->isAlt(); if (!noUkkonen) { scoreLimit = bestPairScore + extraSearchDepth; @@ -833,7 +857,9 @@ IntersectingPairedEndAligner::align( } } - probabilityOfAllPairs += pairProbability; + if (!merged) { + probabilityOfAllPairs += pairProbability; + } #ifdef _DEBUG if (_DumpAlignments) { printf("Added %e (= %e * %e) @ (%u, %u), giving new probability of all pairs %e, score %d = %d + %d%s\n", @@ -1272,14 +1298,21 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren unsigned clause2 = probe == 0; if (clause1 && (clause2 || probeMinusOneHit > maxGenomeLocationToFindThisSeed)) { - if (probeHit - seedOffset > bestLocationFound) { - anyFound = true; - mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; - if (actualUnliftedGenomeLocationFound != NULL) { - *actualUnliftedGenomeLocationFound = (doesGenomeIndexHave64BitLocations - ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]) - seedOffset; + if (actualUnliftedGenomeLocationFound == NULL) { + if (probeHit - seedOffset > bestLocationFound) { + anyFound = true; + mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; + *seedOffsetFound = seedOffset; + } + } else { + GenomeLocation bestUnliftedLocationFound = doesGenomeIndexHave64BitLocations ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]; + if (probeHit - seedOffset > bestLocationFound || + (probeHit - seedOffset == bestLocationFound && *actualUnliftedGenomeLocationFound != bestUnliftedLocationFound)) { + anyFound = true; + mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset; + *actualUnliftedGenomeLocationFound = bestUnliftedLocationFound - seedOffset; + *seedOffsetFound = seedOffset; } - *seedOffsetFound = seedOffset; } if (doesGenomeIndexHave64BitLocations) { @@ -1449,7 +1482,9 @@ IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitL // Within merge distance. Keep the better score (or if they're tied the better match probability). // if (newPairScore < pairScore || (newPairScore == pairScore && - (newMatchProbability > matchProbability || (newMatchProbability == matchProbability && newPairIsNonAlt)))) { + (newMatchProbability >= matchProbability*EPSILON_FACTOR_HI || + (newMatchProbability >= matchProbability*EPSILON_FACTOR_LO && newPairIsNonAlt && + (newMoreHitLocation != locationForReadWithMoreHits || newFewerHitLocation != locationForReadWithFewerHits))))) { #ifdef _DEBUG if (_DumpAlignments) { printf("Merge replacement at anchor (%u, %u), loc (%u, %u), old match prob %e, new match prob %e, old pair score %d, new pair score %d\n", diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h index 403e28e6..39c258ef 100644 --- a/SNAPLib/IntersectingPairedEndAligner.h +++ b/SNAPLib/IntersectingPairedEndAligner.h @@ -407,6 +407,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_, GenomeLocation readWithMoreHitsUnliftedGenomeLocation_) { readWithMoreHitsGenomeLocation = readWithMoreHitsGenomeLocation_; readWithMoreHitsUnliftedGenomeLocation = readWithMoreHitsUnliftedGenomeLocation_; + _ASSERT(readWithMoreHitsUnliftedGenomeLocation != -1); bestPossibleScore = bestPossibleScore_; seedOffset = seedOffset_; score = -2; @@ -420,6 +421,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner struct ScoringCandidate { ScoringCandidate * scoreListNext; // This is a singly-linked list MergeAnchor * mergeAnchor; + MergeAnchor * unliftedMergeAnchor; unsigned scoringMateCandidateIndex; // Index into the array of scoring mate candidates where we should look GenomeLocation readWithFewerHitsGenomeLocation; GenomeLocation readWithFewerHitsUnliftedGenomeLocation; @@ -440,6 +442,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner bestPossibleScore = bestPossibleScore_; scoreListNext = scoreListNext_; mergeAnchor = NULL; + unliftedMergeAnchor = NULL; } bool isAlt() const { return readWithFewerHitsGenomeLocation != readWithFewerHitsUnliftedGenomeLocation; } }; diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp index c2143529..0317f139 100644 --- a/SNAPLib/SAM.cpp +++ b/SNAPLib/SAM.cpp @@ -1062,23 +1062,8 @@ SAMFormat::createSAMLine( return false; } - if (direction == RC) { - for (unsigned i = 0; i < fullLength; i++) { - data[fullLength - 1 - i] = COMPLEMENT[read->getUnclippedData()[i]]; - quality[fullLength - 1 - i] = read->getUnclippedQuality()[i]; - } - clippedData = &data[fullLength - clippedLength - read->getFrontClippedLength()]; - basesClippedBefore = fullLength - clippedLength - read->getFrontClippedLength(); - basesClippedAfter = read->getFrontClippedLength(); - } else { - memcpy(data, read->getUnclippedData(), read->getUnclippedLength()); - memcpy(quality, read->getUnclippedQuality(), read->getUnclippedLength()); - clippedData = read->getData(); - basesClippedBefore = read->getFrontClippedLength(); - basesClippedAfter = fullLength - clippedLength - basesClippedBefore; - } - int editDistance = -1; + const Genome::Contig* contig = NULL; if (genomeLocation != InvalidGenomeLocation) { if (direction == RC) { flags |= SAM_REVERSE_COMPLEMENT; @@ -1092,12 +1077,35 @@ SAMFormat::createSAMLine( positionInContig = genomeLocation - contig->beginningLocation + 1; // SAM is 1-based mapQuality = max(0, min(70, mapQuality)); // FIXME: manifest constant. + if (contig->isAlternateRC) { + // contig was reverse-complemented when building index + flags ^= SAM_REVERSE_COMPLEMENT; + positionInContig = 1 + max(0L, (contig->length - genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength); + direction = direction == RC ? FORWARD : RC; + } } else { flags |= SAM_UNMAPPED; mapQuality = 0; *extraBasesClippedBefore = 0; } + if (direction == RC) { + for (unsigned i = 0; i < fullLength; i++) { + data[fullLength - 1 - i] = COMPLEMENT[read->getUnclippedData()[i]]; + quality[fullLength - 1 - i] = read->getUnclippedQuality()[i]; + } + clippedData = &data[fullLength - clippedLength - read->getFrontClippedLength()]; + basesClippedBefore = fullLength - clippedLength - read->getFrontClippedLength(); + basesClippedAfter = read->getFrontClippedLength(); + } else { + memcpy(data, read->getUnclippedData(), read->getUnclippedLength()); + memcpy(quality, read->getUnclippedQuality(), read->getUnclippedLength()); + clippedData = read->getData(); + basesClippedBefore = read->getFrontClippedLength(); + basesClippedAfter = fullLength - clippedLength - basesClippedBefore; + } + + if (hasMate) { flags |= SAM_MULTI_SEGMENT; flags |= (firstInPair ? SAM_FIRST_SEGMENT : SAM_LAST_SEGMENT); @@ -1112,6 +1120,11 @@ SAMFormat::createSAMLine( if (mateDirection == RC) { flags |= SAM_NEXT_REVERSED; } + if (mateContig->isAlternateRC) { + // mate contig was reverse-complemented when building index + flags ^= SAM_NEXT_REVERSED; + matePositionInContig = 1 + max(0L, (mateContig->length - genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength); + } if (genomeLocation == InvalidGenomeLocation) { // @@ -1138,16 +1151,17 @@ SAMFormat::createSAMLine( if (alignedAsPair) { flags |= SAM_ALL_ALIGNED; } - // Also compute the length of the whole paired-end string whose ends we saw. This is slightly - // tricky because (a) we may have clipped some bases before/after each end and (b) we need to - // give a signed result based on whether our read is first or second in the pair. - GenomeLocation myStart = genomeLocation - basesClippedBefore; - GenomeLocation myEnd = genomeLocation + clippedLength + basesClippedAfter; - _int64 mateBasesClippedBefore = mate->getFrontClippedLength(); - _int64 mateBasesClippedAfter = mate->getUnclippedLength() - mate->getDataLength() - mateBasesClippedBefore; - GenomeLocation mateStart = mateLocation - (mateDirection == RC ? mateBasesClippedAfter : mateBasesClippedBefore); - GenomeLocation mateEnd = mateLocation + mate->getDataLength() + (mateDirection == FORWARD ? mateBasesClippedAfter : mateBasesClippedBefore); - if (contigName == matecontigName) { // pointer (not value) comparison, but that's OK. + // todo: should this look at lifted locations for alt contigs that map to same non-alt contig? + if (contigIndex == mateContigIndex) { + // Also compute the length of the whole paired-end string whose ends we saw. This is slightly + // tricky because (a) we may have clipped some bases before/after each end and (b) we need to + // give a signed result based on whether our read is first or second in the pair. + GenomeDistance myStart = positionInContig - basesClippedBefore; + GenomeDistance myEnd = positionInContig + clippedLength + basesClippedAfter; + _int64 mateBasesClippedBefore = mate->getFrontClippedLength(); + _int64 mateBasesClippedAfter = mate->getUnclippedLength() - mate->getDataLength() - mateBasesClippedBefore; + GenomeDistance mateStart = matePositionInContig - (mateDirection == RC ? mateBasesClippedAfter : mateBasesClippedBefore); + GenomeDistance mateEnd = matePositionInContig + mate->getDataLength() + (mateDirection == FORWARD ? mateBasesClippedAfter : mateBasesClippedBefore); if (myStart < mateStart) { templateLength = mateEnd - myStart; } else { @@ -1228,16 +1242,12 @@ SAMFormat::writeRead( } if (genomeLocation != InvalidGenomeLocation) { - if (!context.genome->getContigs()[contigIndex].isAlternateRC) { - cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, - clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, - read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, - &editDistance, o_addFrontClipping); - if (*o_addFrontClipping != 0) { - return false; - } - } else { - + cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize, + clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, + read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM, + &editDistance, o_addFrontClipping); + if (*o_addFrontClipping != 0) { + return false; } } @@ -1304,19 +1314,6 @@ SAMFormat::writeRead( readGroupString = read->getReadGroup(); } } - const Genome::Contig* contig = &context.genome->getContigs()[contigIndex]; - if (contig->isAlternateRC) { - // contig was reverse-complemented when building index - // so reverse flags, adjust position; CIGAR string was reversed in computeCigar - flags ^= SAM_REVERSE_COMPLEMENT; - positionInContig = 1 + max(0L, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength); - } - const Genome::Contig* mateContig = &context.genome->getContigs()[mateContigIndex]; - if (mateContig->isAlternateRC) { - // same for mate - flags ^= SAM_NEXT_REVERSED; - matePositionInContig = 1 + max(0L, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength); - } int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s%.*s\n", qnameLen, read->getId(), flags, @@ -1388,6 +1385,16 @@ SAMFormat::computeCigar( const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation); + const char *reference = genome->getSubstring(genomeLocation, dataLength); + if (contig->isAlternateRC) { + // the original reference was reverse-complemented on index build to simplify alignment + // so reverse-complement reference for CIGAR string + // data was already flipped in createSAMLine if needed + char* referenceBuf = (char*)alloca(dataLength + MAX_K); + util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K); + reference = referenceBuf; + } + if (genomeLocation + dataLength > contig->beginningLocation + contig->length - genome->getChromosomePadding()) { // // The read hangs off the end of the contig. Soft clip it at the end. This is a tentative amount that assumes no net indels in the @@ -1398,7 +1405,6 @@ SAMFormat::computeCigar( *o_extraBasesClippedAfter = 0; } - const char *reference = genome->getSubstring(genomeLocation, dataLength); if (NULL == reference) { // // Fell off the end of the contig. @@ -1410,16 +1416,6 @@ SAMFormat::computeCigar( return; } - if (contig->isAlternateRC) { - // the original reference was reverse-complemented on index build to simplify alignment - // so reverse-complement both reference and data for CIGAR string - char* dataBuf = (char*)alloca(dataLength); - util::toComplement(dataBuf, data, (int)dataLength); - data = dataBuf; - char* referenceBuf = (char*)alloca(dataLength + MAX_K); - util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K); - reference = referenceBuf; - } *o_editDistance = lv->computeEditDistanceNormalized( reference, @@ -1596,10 +1592,8 @@ SAMFormat::validateCigarString( } if (contig->isAlternateRC) { // the original reference was reverse-complemented on index build to simplify alignment - // so reverse-complement both reference and data for CIGAR string - char* dataBuf = (char*)alloca(dataLength); - util::toComplement(dataBuf, data, (int)dataLength); - data = dataBuf; + // so reverse-complement reference for CIGAR string + // data was already flipped in createSAMLine if needed char* referenceBuf = (char*)alloca(dataLength + MAX_K); util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K); reference = referenceBuf; diff --git a/tests/alttestgen.py b/tests/alttestgen.py index e8ce8420..20640293 100644 --- a/tests/alttestgen.py +++ b/tests/alttestgen.py @@ -46,14 +46,6 @@ def __init__(self, id, chr, pos, seq, qual=None): def __str__(self): return "Read({}, {}, {}, {})".format(self.id, self.chr, self.pos, self.seq) - def to_sam_pair(self, other): - r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( - self.id, 99, self.chr, self.pos + 1, 60, len(self.seq), other.chr, - other.pos + 1, abs(self.pos - other.pos + len(other.seq)), self.seq, 'A'*len(self.seq)) - return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( - other.id, 147, other.chr, other.pos + 1, 60, len(other.seq), self.chr, - self.pos + 1, abs(self.pos - other.pos + len(other.seq)), other.seq, 'A'*len(other.seq)) - class Contig: def __init__(self, name, accession, seq, isAlt=False, parent=None, parentLoc = 0, isAltRC=False): self.name = name @@ -83,35 +75,37 @@ def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05): altseq = rc(altseq) self.add(Contig(name, accession, altseq, True, parent, start, isRC)) - def get_seq(self, chr, start, end): - contig = self.contigs[chr] - if not contig.isAltRC: - return contig.seq[start:end] - else: - return rc(contig.seq[len(contig.seq) - end : len(contig.seq) - start]) - - def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None): + def make_read(self, chr, pos, isReverse=False, len=100, pmut=.02, id=None): if id == None: - id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isRC else 'f')) - seq = random_mutate(self.get_seq(chr, pos, pos + len)) - if isRC: - seq = rc(seq) - return Read(id, chr, pos, seq, pmut) + id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isReverse else 'f')) + seq = random_mutate(self.contigs[chr].seq[pos:pos + len], pmut) + return Read(id, chr, pos, seq) def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02): - id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1+1, chr2, pos2+1) + id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1 + 1, chr2, pos2 + 1) r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1") r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2") return [r1, r2] + def to_sam_pair(self, read1, read2): + rc1 = 1 if self.contigs[read1.chr].isAltRC else 0 + rc2 = 0 if self.contigs[read2.chr].isAltRC else 1 + r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( + read1.id, 67+16*rc1+32*rc2, read1.chr, read1.pos + 1, 60, len(read1.seq), read2.chr, + read2.pos + 1, abs(read1.pos - read2.pos + len(read2.seq)), read1.seq, (['ABCD','DCBA'][rc1]*int(len(read1.seq)/4+1))[:len(read1.seq)]) + return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format( + read2.id, 131+16*rc2+32*rc1, read2.chr, read2.pos + 1, 60, len(read2.seq), read1.chr, + read1.pos + 1, abs(read1.pos - read2.pos + len(read2.seq)), read2.seq, (['ABCD','DCBA'][rc2]*int(len(read2.seq)/4+1))[:len(read2.seq)]) + def write_fasta(self, filename): with open(filename, 'w') as file: for write_alts in [False, True]: for contig in self.contigs.values(): if contig.isAlt == write_alts: file.write(">{}|gb|{}\n".format(contig.name, contig.accession)) - for i in range(0, len(contig.seq), 80): - file.write("{}\n".format(contig.seq[i:i+80])) + LINE_LEN=100 + for i in range(0, len(contig.seq), LINE_LEN): + file.write("{}\n".format(contig.seq[i:i+LINE_LEN])) def write_alts(self, filename): with open(filename, 'w') as file: @@ -134,7 +128,7 @@ def write_alts(self, filename): if i < 2000: [r1, r2] = g.make_pair('chr1', i, 'chr1a' , i) else: - [r1, r2] = g.make_pair('chr1', i, 'chr1b' , i - 2000) - file.write(r1.to_sam_pair(r2)) + [r1, r2] = g.make_pair('chr1', i, 'chr1b' , 900 - (i - 2000)) + file.write(g.to_sam_pair(r1,r2)) [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000) - file.write(r1.to_sam_pair(r2)) + file.write(g.to_sam_pair(r1,r2)) From 68f90d2a7c0c4f2b3c08db7f393e4efb1dd834b0 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Fri, 8 Jan 2016 15:45:45 -0800 Subject: [PATCH 13/19] Fix lifted index sort --- SNAPLib/GenomeIndex.cpp | 85 +++++++++++++++++------- SNAPLib/IntersectingPairedEndAligner.cpp | 46 ++++++++----- tests/alttestgen.py | 40 +++++++---- 3 files changed, 116 insertions(+), 55 deletions(-) diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 4ae13022..a257c3c0 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -747,6 +747,19 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla delete [] histogram; } + if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) { + // create a sub-index with only seeds that occur in alt contigs + // need to build lifted index here because it will reorder unlifted index overflow table + snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); + bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, + hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); + if (!ok) { + WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer); + soft_exit(1); + return false; + } + } + // // Now save out the part of the index that's independent of the genome itself. // @@ -805,18 +818,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla fclose(indexFile); - if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) { - // create a sub-index with only seeds that occur in alt contigs - snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); - bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, - hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); - if (!ok) { - WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer); - soft_exit(1); - return false; - } - } - index->genome = NULL; // deleted earlier delete index; if (computeBias && biasTable != NULL) { @@ -1388,22 +1389,23 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa CHECK_ALTS_AND_ADD_LIFTED } } - +#undef CHECK_ALTS_AND_ADD_LIFTED + void -dualSort32( +dualBackwardsSort32( _int64 n, unsigned* keys, unsigned* values) { // todo: optimize sorting, just using a simple selection sort for now unsigned t; -#define DUAL_SORT \ +#define DUAL_BACKWARDS_SORT \ if (n < 2) { \ return; \ } \ for (_int64 i = 0; i < n - 1; i++) { \ for (_int64 j = n - 1; j > i; j--) { \ - if (keys[i] > keys[j]) { \ + if (keys[i] < keys[j]) { \ t = keys[i]; \ keys[i] = keys[j]; \ keys[j] = t; \ @@ -1413,18 +1415,19 @@ dualSort32( } \ } \ } - DUAL_SORT + DUAL_BACKWARDS_SORT } void -dualSort( +dualBackwardsSort( _int64 n, GenomeLocation* keys, GenomeLocation* values) { GenomeLocation t; - DUAL_SORT + DUAL_BACKWARDS_SORT } +#undef DUAL_BACKWARDS_SORT void GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large) @@ -1443,9 +1446,26 @@ GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashT if (nLiftedHits > 1 || nLiftedRCHits > 1) { context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); - if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) { - dualSort(nHits, (GenomeLocation*)liftedHits, (GenomeLocation*)hits); - dualSort(nRCHits, (GenomeLocation*)liftedRCHits, (GenomeLocation*)rcHits); + if ((nHits > 1 && genomeLocation == hits[0]) || (nRCHits > 1 && genomeLocation == rcHits[0])) { + // re-lift unlifted so that the order corresponds, then sort both by lifted location + for (int i = 0; i < nHits; i++) { + ((GenomeLocation*)liftedHits)[i] = genome->getLiftedLocation(hits[i]); + } + for (int i = 0; i < nRCHits; i++) { + ((GenomeLocation*)liftedRCHits)[i] = genome->getLiftedLocation(rcHits[i]); + } + dualBackwardsSort(nHits, (GenomeLocation*)liftedHits, (GenomeLocation*)hits); + dualBackwardsSort(nRCHits, (GenomeLocation*)liftedRCHits, (GenomeLocation*)rcHits); +#ifdef _DEBUG + for (int i = 0; i < nHits; i++) { + _ASSERT(genome->getLiftedLocation(hits[i]) == liftedHits[i]); + _ASSERT(i == 0 || liftedHits[i - 1] >= liftedHits[i]); + } + for (int i = 0; i < nRCHits; i++) { + _ASSERT(genome->getLiftedLocation(rcHits[i]) == liftedRCHits[i]); + _ASSERT(i == 0 || liftedRCHits[i - 1] >= liftedRCHits[i]); + } +#endif } } } else { @@ -1456,8 +1476,25 @@ GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashT context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) { - dualSort32(nHits, (unsigned*)liftedHits, (unsigned*)hits); - dualSort32(nRCHits, (unsigned*)liftedRCHits, (unsigned*)rcHits); + // re-lift unlifted so that the order corresponds, then sort both by lifted location + for (int i = 0; i < nHits; i++) { + ((unsigned*)liftedHits)[i] = GenomeLocationAsInt32(genome->getLiftedLocation(hits[i])); + } + for (int i = 0; i < nRCHits; i++) { + ((unsigned*)liftedRCHits)[i] = GenomeLocationAsInt32(genome->getLiftedLocation(rcHits[i])); + } + dualBackwardsSort32(nHits, (unsigned*)liftedHits, (unsigned*)hits); + dualBackwardsSort32(nRCHits, (unsigned*)liftedRCHits, (unsigned*)rcHits); +#ifdef _DEBUG + for (int i = 0; i < nHits; i++) { + _ASSERT(genome->getLiftedLocation(hits[i]) == liftedHits[i]); + _ASSERT(i == 0 || liftedHits[i - 1] >= liftedHits[i]); + } + for (int i = 0; i < nRCHits; i++) { + _ASSERT(genome->getLiftedLocation(rcHits[i]) == liftedRCHits[i]); + _ASSERT(i == 0 || liftedRCHits[i - 1] >= liftedRCHits[i]); + } +#endif } } } diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index c1cc00ec..92f73883 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -186,7 +186,7 @@ IntersectingPairedEndAligner::align( #ifdef _DEBUG if (_DumpAlignments) { - printf("\nIntersectingAligner aligning reads '%*.s' and '%.*s' with data '%.*s' and '%.*s'\n", read0->getIdLength(), read0->getId(), read1->getIdLength(), read1->getId(), read0->getDataLength(), read0->getData(), read1->getDataLength(), read1->getData()); + printf("\nIntersectingAligner aligning reads '%.*s' and '%.*s' with data '%.*s' and '%.*s'\n", read0->getIdLength(), read0->getId(), read1->getIdLength(), read1->getId(), read0->getDataLength(), read0->getData(), read1->getDataLength(), read1->getData()); } #endif // _DEBUG @@ -434,8 +434,8 @@ IntersectingPairedEndAligner::align( GenomeLocation lastGenomeLocationForReadWithMoreHits; GenomeLocation lastUnliftedGenomeLocationForReadWithFewerHits; GenomeLocation lastUnliftedGenomeLocationForReadWithMoreHits; - GenomeLocation *pLastGenomeLocationForReadWithFewerHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithFewerHits : NULL; - GenomeLocation *pLastGenomeLocationForReadWithMoreHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithMoreHits : NULL; + GenomeLocation *pLastUnliftedGenomeLocationForReadWithFewerHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithFewerHits : NULL; + GenomeLocation *pLastUnliftedGenomeLocationForReadWithMoreHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithMoreHits : NULL; unsigned lastSeedOffsetForReadWithMoreHits; bool outOfMoreHitsLocations = false; @@ -443,13 +443,13 @@ IntersectingPairedEndAligner::align( // // Seed the intersection state by doing a first lookup. // - if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) { + if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) { // // No hits in this direction. // continue; // The outer loop over set pairs. } - + _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits); lastGenomeLocationForReadWithMoreHits = InvalidGenomeLocation; // @@ -471,9 +471,10 @@ IntersectingPairedEndAligner::align( // location that's not too high. // if (!setPair[readWithMoreHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithFewerHits + maxSpacing, - &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) { + &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) { break; // End of all of the mates. We're done with this set pair. } + _ASSERT(pLastUnliftedGenomeLocationForReadWithMoreHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithMoreHits) == lastGenomeLocationForReadWithMoreHits); } if ((lastGenomeLocationForReadWithMoreHits + maxSpacing < lastGenomeLocationForReadWithFewerHits || outOfMoreHitsLocations) && @@ -490,12 +491,13 @@ IntersectingPairedEndAligner::align( } if (!setPair[readWithFewerHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithMoreHits + maxSpacing, &lastGenomeLocationForReadWithFewerHits, - &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) { + &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) { // // No more candidates on the read with fewer hits side. We're done with this set pair. // break; } + _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits); continue; } @@ -522,8 +524,8 @@ IntersectingPairedEndAligner::align( #ifdef _DEBUG if (_DumpAlignments) { - printf("SetPair %d, added more hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n", - whichSetPair, lowestFreeScoringMateCandidate[whichSetPair], lastGenomeLocationForReadWithMoreHits, + printf("SetPair %d, added more hits candidate %d at genome location %u(%u), bestPossibleScore %d, seedOffset %d\n", + whichSetPair, lowestFreeScoringMateCandidate[whichSetPair], lastGenomeLocationForReadWithMoreHits, lastUnliftedGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits); } @@ -533,11 +535,12 @@ IntersectingPairedEndAligner::align( previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits; - if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) { + if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) { lastGenomeLocationForReadWithMoreHits = 0; outOfMoreHitsLocations = true; break; // out of the loop looking for candidates on the more hits side. } + _ASSERT(pLastUnliftedGenomeLocationForReadWithMoreHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithMoreHits) == lastGenomeLocationForReadWithMoreHits); } // @@ -586,8 +589,8 @@ IntersectingPairedEndAligner::align( #ifdef _DEBUG if (_DumpAlignments) { - printf("SetPair %d, added fewer hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n", - whichSetPair, lowestFreeScoringCandidatePoolEntry, lastGenomeLocationForReadWithFewerHits, + printf("SetPair %d, added fewer hits candidate %d at genome location %u(%u), bestPossibleScore %d, seedOffset %d\n", + whichSetPair, lowestFreeScoringCandidatePoolEntry, lastGenomeLocationForReadWithFewerHits, lastUnliftedGenomeLocationForReadWithFewerHits, lowestBestPossibleScoreOfAnyPossibleMate + bestPossibleScoreForReadWithFewerHits, lastSeedOffsetForReadWithFewerHits); } @@ -597,9 +600,10 @@ IntersectingPairedEndAligner::align( maxUsedBestPossibleScoreList = max(maxUsedBestPossibleScoreList, bestPossibleScore); } - if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) { + if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) { break; } + _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits); } } // For each set pair @@ -638,8 +642,8 @@ IntersectingPairedEndAligner::align( #ifdef _DEBUG if (_DumpAlignments) { - printf("Scored fewer end candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n", (int)(candidate - scoringCandidatePool), - candidate->whichSetPair, readWithFewerHits, candidate->readWithFewerHitsGenomeLocation, candidate->seedOffset, + printf("Scored fewer end candidate %d, set pair %d, read %d, location %u(%u), seed offset %d, score limit %d, score %d, offset %d\n", (int)(candidate - scoringCandidatePool), + candidate->whichSetPair, readWithFewerHits, candidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsUnliftedGenomeLocation, candidate->seedOffset, scoreLimit, fewerEndScore, fewerEndGenomeLocationOffset); } #endif // DEBUG @@ -669,8 +673,8 @@ IntersectingPairedEndAligner::align( &mate->genomeOffset); #ifdef _DEBUG if (_DumpAlignments) { - printf("Scored mate candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n", - (int)(mate - scoringMateCandidates[candidate->whichSetPair]), candidate->whichSetPair, readWithMoreHits, mate->readWithMoreHitsGenomeLocation, + printf("Scored mate candidate %d, set pair %d, read %d, location %u(%u), seed offset %d, score limit %d, score %d, offset %d\n", + (int)(mate - scoringMateCandidates[candidate->whichSetPair]), candidate->whichSetPair, readWithMoreHits, mate->readWithMoreHitsGenomeLocation, mate->readWithMoreHitsUnliftedGenomeLocation, mate->seedOffset, scoreLimit - fewerEndScore, mate->score, mate->genomeOffset); } #endif // _DEBUG @@ -862,9 +866,10 @@ IntersectingPairedEndAligner::align( } #ifdef _DEBUG if (_DumpAlignments) { - printf("Added %e (= %e * %e) @ (%u, %u), giving new probability of all pairs %e, score %d = %d + %d%s\n", + printf("Added %e (= %e * %e) @ (%u, %u)((%u, %u)), giving new probability of all pairs %e, score %d = %d + %d%s\n", pairProbability, mate->matchProbability , fewerEndMatchProbability, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, + candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset, mate->readWithMoreHitsUnliftedGenomeLocation+ mate->genomeOffset, probabilityOfAllPairs, pairScore, fewerEndScore, mate->score, isBestHit ? " New best hit" : ""); } @@ -1476,6 +1481,11 @@ IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitL matchProbability = newMatchProbability; pairScore = newPairScore; *oldMatchProbability = 0.0; +#ifdef _DEBUG + if (_DumpAlignments) { + printf("New anchor loc (%u, %u)\n", newMoreHitLocation, newFewerHitLocation); + } +#endif return false; } else { // diff --git a/tests/alttestgen.py b/tests/alttestgen.py index 20640293..082e6830 100644 --- a/tests/alttestgen.py +++ b/tests/alttestgen.py @@ -68,20 +68,20 @@ def __init__(self, contigs={}): def add(self, contig): self.contigs[contig.name] = contig - def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05): + def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.01): pc = self.contigs[parent] altseq = random_mutate(pc.seq[start:stop], pmut) if (isRC): altseq = rc(altseq) self.add(Contig(name, accession, altseq, True, parent, start, isRC)) - def make_read(self, chr, pos, isReverse=False, len=100, pmut=.02, id=None): + def make_read(self, chr, pos, isReverse=False, len=100, pmut=.01, id=None): if id == None: id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isReverse else 'f')) seq = random_mutate(self.contigs[chr].seq[pos:pos + len], pmut) return Read(id, chr, pos, seq) - def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02): + def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.01): id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1 + 1, chr2, pos2 + 1) r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1") r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2") @@ -100,7 +100,10 @@ def to_sam_pair(self, read1, read2): def write_fasta(self, filename): with open(filename, 'w') as file: for write_alts in [False, True]: - for contig in self.contigs.values(): + cnames = self.contigs.keys() + cnames.sort() + for cname in cnames: + contig = self.contigs[cname] if contig.isAlt == write_alts: file.write(">{}|gb|{}\n".format(contig.name, contig.accession)) LINE_LEN=100 @@ -117,18 +120,29 @@ def write_alts(self, filename): 1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0)) g = Genome() -g.add(Contig("chr1", "C01", random_bases(5000))) +seq = random_bases(7000) +seq = seq + random_mutate(seq[5000:6000]) + random_bases(1000) + random_mutate(seq[5000:6000]) + random_bases(1000) +g.add(Contig("chr1", "C01", seq)) g.add_alt("chr1a", "C01A", "chr1", 1000, 2000) g.add_alt("chr1b", "C01B", "chr1", 3000, 4000, True) +g.add_alt("chr1c", "C01C", "chr1", 5000, 6000) +g.add_alt("chr1d", "C01D", "chr1", 7000, 8000, True) +g.add_alt("chr1e", "C01E", "chr1", 9000, 10000) +g.add_alt("chr1f", "C01F", "chr1", 9000, 10000) g.write_fasta("test.fa") g.write_alts("test_alts.txt") with open("test.sam", "w") as file: - for i in [100, 150, 200, 250, 2100, 2150, 2200, 2250]: - if i < 2000: - [r1, r2] = g.make_pair('chr1', i, 'chr1a' , i) - else: - [r1, r2] = g.make_pair('chr1', i, 'chr1b' , 900 - (i - 2000)) - file.write(g.to_sam_pair(r1,r2)) - [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000) - file.write(g.to_sam_pair(r1,r2)) + for i in [0, 100, 600, 800, 900]: + for j in range(5): + start = j * 2000 + chralt = ['chr1a', 'chr1b', 'chr1c', 'chr1d', 'chr1e'][j] + ialt = 900 - i if g.contigs[chralt].isAltRC else i + [r1, r2] = g.make_pair('chr1', i + start, chralt, ialt) + file.write(g.to_sam_pair(r1,r2)) + [r1, r2] = g.make_pair('chr1', i + start, 'chr1', i + start + 1000) + file.write(g.to_sam_pair(r1,r2)) + [r1, r2] = g.make_pair(chralt, ialt, 'chr1', i + start + 2000) + file.write(g.to_sam_pair(r1,r2)) + [r1, r2] = g.make_pair('chr1', i + start + 1000, 'chr1', i + start + 2000) + file.write(g.to_sam_pair(r1,r2)) From 6a4bfd8ff07b6202d66daf0cd7b38ab7773c096d Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Mon, 11 Jan 2016 13:21:34 -0800 Subject: [PATCH 14/19] Compile on Linux --- SNAPLib/GenomeIndex.cpp | 34 +++++++++++++++++++++++----------- tests/alttestgen.py | 8 +++----- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index a257c3c0..08825b83 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -540,6 +540,19 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla WriteStatusMessage("%llds\n", (timeInMillis() - spillDone + 500) / 1000); } + // declare variables before goto + _uint64 nBackpointersProcessed; + _int64 lastPrintTime; + const unsigned maxHistogramEntry = 500000; + _uint64 countOfTooBigForHistogram; + _uint64 sumOfTooBigForHistogram; + _uint64 largestSeed; + unsigned *histogram; + FILE *tablesFile; + size_t totalBytesWritten; + _uint64 overflowTableIndex; + _uint64 duplicateSeedsProcessed; + if (unliftedIndex != NULL && liftedIndexPass == 1) { goto lifted_skip_overflow; } @@ -571,14 +584,13 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla soft_exit(1); } - _uint64 nBackpointersProcessed = 0; - _int64 lastPrintTime = timeInMillis(); + nBackpointersProcessed = 0; + lastPrintTime = timeInMillis(); - const unsigned maxHistogramEntry = 500000; - _uint64 countOfTooBigForHistogram = 0; - _uint64 sumOfTooBigForHistogram = 0; - _uint64 largestSeed = 0; - unsigned *histogram = NULL; + countOfTooBigForHistogram = 0; + sumOfTooBigForHistogram = 0; + largestSeed = 0; + histogram = NULL; if (buildHistogram) { histogram = new unsigned[maxHistogramEntry+1]; for (unsigned i = 0; i <= maxHistogramEntry; i++) { @@ -591,15 +603,15 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // Write the hash tables as we go so that we can free their memory on the fly. // snprintf(filenameBuffer,filenameBufferSize,"%s%c%s", directoryName, PATH_SEP, GenomeIndexHashFileName); - FILE *tablesFile = fopen(filenameBuffer, "wb"); + tablesFile = fopen(filenameBuffer, "wb"); if (NULL == tablesFile) { WriteErrorMessage("Unable to open hash table file '%s'\n", filenameBuffer); soft_exit(1); } - size_t totalBytesWritten = 0; - _uint64 overflowTableIndex = 0; - _uint64 duplicateSeedsProcessed = 0; + totalBytesWritten = 0; + overflowTableIndex = 0; + duplicateSeedsProcessed = 0; for (unsigned whichHashTable = 0; whichHashTable < nHashTables; whichHashTable++) { if (NULL == hashTables[whichHashTable]) { diff --git a/tests/alttestgen.py b/tests/alttestgen.py index 082e6830..f92bc700 100644 --- a/tests/alttestgen.py +++ b/tests/alttestgen.py @@ -11,8 +11,6 @@ import subprocess import random -import pandas as pd - BASES = "ACTG" RCBASES = {"A":"T", "T":"A", "C":"G", "G":"C"} @@ -68,20 +66,20 @@ def __init__(self, contigs={}): def add(self, contig): self.contigs[contig.name] = contig - def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.01): + def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.02): pc = self.contigs[parent] altseq = random_mutate(pc.seq[start:stop], pmut) if (isRC): altseq = rc(altseq) self.add(Contig(name, accession, altseq, True, parent, start, isRC)) - def make_read(self, chr, pos, isReverse=False, len=100, pmut=.01, id=None): + def make_read(self, chr, pos, isReverse=False, len=100, pmut=.02, id=None): if id == None: id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isReverse else 'f')) seq = random_mutate(self.contigs[chr].seq[pos:pos + len], pmut) return Read(id, chr, pos, seq) - def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.01): + def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02): id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1 + 1, chr2, pos2 + 1) r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1") r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2") From e0df34d43302c55f3f5a944647859cac0754ea8c Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Tue, 12 Jan 2016 10:52:08 -0800 Subject: [PATCH 15/19] Calculate bias table for lifted index --- SNAPLib/GenomeIndex.cpp | 108 ++++++++++++++++++++++++++++++---------- SNAPLib/GenomeIndex.h | 13 +++-- 2 files changed, 89 insertions(+), 32 deletions(-) diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 08825b83..43b4563f 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -337,6 +337,9 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // Compute bias table sizes, unless we're using the precomputed ones hardcoded in BiasTables.cpp double *biasTable = NULL; + if (unliftedIndex != NULL) { + computeBias = true; + } if (!computeBias) { if (large) { biasTable = hg19_biasTables_large[hashTableKeySize][seedLen]; @@ -353,7 +356,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla if (computeBias) { unsigned nHashTables = 1 << ((max((unsigned)seedLen, hashTableKeySize * 4) - hashTableKeySize * 4) * 2); biasTable = new double[nHashTables]; - ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large); + ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large, unliftedIndex); } WriteStatusMessage("Allocating memory for hash tables..."); @@ -980,7 +983,7 @@ GenomeIndex::~GenomeIndex() } void -GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large) +GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large, const GenomeIndex* unliftedIndex) /** * Fill in table with the table size biases for a given genome and seed size. * We assume that table is already of the correct size for our seed size @@ -1053,12 +1056,14 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables); - - if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) { - _uint64 value = 42; - seedsSeen->Insert(seed.getBases(), &value); - numExactSeeds[seed.getHighBases(hashTableKeySize)]++; - } + _int64 nHits, nRCHits; + if (unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex, &nHits, &nRCHits)) { + if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) { + _uint64 value = 42; + seedsSeen->Insert(seed.getBases(), &value); + numExactSeeds[seed.getHighBases(hashTableKeySize)]++; + } + } } // for (unsigned i = 0; i < nHashTables; i++) printf("Hash table %d is predicted to have %lld entries\n", i, numExactSeeds[i]); @@ -1102,6 +1107,8 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, contexts[i].validSeeds = &validSeeds; contexts[i].approximateCounterLocks = locks; contexts[i].large = large; + contexts[i].unliftedIndex = unliftedIndex; + StartNewThread(ComputeBiasTableWorkerThreadMain, &contexts[i]); } @@ -1206,19 +1213,30 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) _ASSERT(whichHashTable < context->nHashTables); - if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) { - PerCounterBatch *batch = &batches[whichHashTable]; - AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]); - batch->apply(&(*context->approxCounters)[whichHashTable]); - ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]); + _int64 nRepeats = 1; + if (context->unliftedIndex != NULL) { + _int64 nHits, nRCHits; + if (hasAnyAltHits(seed, i, context->unliftedIndex, &nHits, &nRCHits)) { + nRepeats = nHits + nRCHits; + } else { + nRepeats = 0; + } + } + for (; nRepeats > 0; nRepeats--) { + if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) { + PerCounterBatch *batch = &batches[whichHashTable]; + AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]); + batch->apply(&(*context->approxCounters)[whichHashTable]); + ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]); - _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds); + _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds); - if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds)/printBatchSize) { - WriteStatusMessage("Bias computation: %lld / %lld\n",(basesProcessed/printBatchSize)*printBatchSize, (_int64)countOfBases); - } - unrecordedSkippedSeeds= 0; // We've now recorded them. - } + if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds) / printBatchSize) { + WriteStatusMessage("Bias computation: %lld / %lld\n", (basesProcessed / printBatchSize)*printBatchSize, (_int64)countOfBases); + } + unrecordedSkippedSeeds = 0; // We've now recorded them. + } + } } for (unsigned i = 0; i < context->nHashTables; i++) { @@ -1246,7 +1264,43 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) } } - +bool +GenomeIndex::hasAnyAltHits( + Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits) +{ + if (unliftedIndex == NULL) { + return false; + } + _int64 nHits, nRCHits; + if (unliftedIndex->doesGenomeIndexHave64BitLocations()) { + const GenomeLocation *hits, *rcHits; + GenomeLocation singleHit[2], singleRCHit[2]; + unliftedIndex->lookupSeed(seed, pnHits, &hits, pnRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); + *pnHits = nHits; + *pnRCHits = nRCHits; +#define HAS_ANY_ALTS \ + if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ + (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \ + for (int i = 0; i < nHits; i++) { \ + if (unliftedIndex->genome->isAltLocation(hits[i])) { \ + return true; \ + } \ + } \ + for (int i = 0; i < nRCHits; i++) { \ + if (unliftedIndex->genome->isAltLocation(hits[i])) { \ + return true; \ + } \ + } \ + return false; \ + } + HAS_ANY_ALTS + } else { + const unsigned *hits, *rcHits; + unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); + HAS_ANY_ALTS + } +} +#undef HAS_ANY_ALTS void GenomeIndex::BuildHashTablesWorkerThreadMain(void *param) @@ -1375,11 +1429,11 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \ bool anyAlts = false; \ - for (int i = 0; i < nHits && ! anyAlts; i++) { \ - anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; \ + for (int i = 0; i < nHits && !anyAlts; i++) { \ + anyAlts = genome->isAltLocation(hits[i]); \ } \ for (int i = 0; i < nRCHits && !anyAlts; i++) { \ - anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; \ + anyAlts = genome->isAltLocation(rcHits[i]); \ } \ if (anyAlts) { \ for (int i = 0; i < nHits; i++) { \ @@ -2128,7 +2182,7 @@ GenomeIndex::lookupSeed32( _int64 *nHits, const unsigned **hits, _int64 *nRCHits, - const unsigned **rcHits) + const unsigned **rcHits) const { _ASSERT(locationSize == 4); // This is the caller's responsibility to check. @@ -2216,7 +2270,7 @@ GenomeIndex::lookupSeedAlt32( GenomeIndex::fillInLookedUpResults32( const unsigned *subEntry, _int64 *nHits, - const unsigned **hits) + const unsigned **hits) const { // // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at @@ -2265,7 +2319,7 @@ GenomeIndex::lookupSeed( _int64 * nRCHits, const GenomeLocation ** rcHits, GenomeLocation * singleHit, - GenomeLocation * singleRCHit) + GenomeLocation * singleRCHit) const { _ASSERT(locationSize > 4 && locationSize <= 8); @@ -2367,7 +2421,7 @@ GenomeIndex::lookupSeedAlt( } void -GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) +GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) const { // // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index c0a45840..c0897a79 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -47,8 +47,8 @@ class GenomeIndex { // be pointed to as a return value. When only a single hit is returned, *hits == singleHit, so there's // no need to check on the caller's side. // - void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit); - void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits); + void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit) const; + void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits) const; // versions for genome that has alt regions // hits/rcHits locations are lifted to non-alt contigs, unliftedHits/unliftedRCHits are original locations in alt contigs @@ -185,7 +185,7 @@ class GenomeIndex { static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1]; static double *hg19_biasTables_large[largestKeySize+1][largestBiasTable+1]; - static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large); + static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large, const GenomeIndex* unliftedIndex = NULL); struct ComputeBiasTableThreadContext { SingleWaiterObject *doneObject; @@ -200,12 +200,15 @@ class GenomeIndex { unsigned seedLen; volatile _int64 *validSeeds; bool large; + const GenomeIndex *unliftedIndex; ExclusiveLock *approximateCounterLocks; }; static void ComputeBiasTableWorkerThreadMain(void *param); + static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits); + struct OverflowBackpointer; struct BuildHashTablesThreadContext { @@ -316,6 +319,6 @@ class GenomeIndex { BuildHashTablesThreadContext*context, GenomeLocation genomeLocation); - void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits); - void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation); + void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits) const; + void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) const; }; From ea14df263644f305530272cf36de0eb5f0ca888c Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Tue, 12 Jan 2016 14:49:04 -0800 Subject: [PATCH 16/19] Cleanup status printing for alt index build --- SNAPLib/GenomeIndex.cpp | 43 ++++++++++++++++++++++------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 43b4563f..637bf9b9 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -382,14 +382,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla start = timeInMillis(); volatile _int64 nextOverflowBackpointer = 0; - volatile _int64 nonSeeds = 0; - volatile _int64 seedsWithMultipleOccurrences = 0; - volatile _int64 genomeLocationsInOverflowTable = 0; // Number of extra hits on duplicate indices. This should come out once we implement the overflow table. - volatile _int64 bothComplementsUsed = 0; // Number of hash buckets where both complements are used - volatile _int64 noBaseAvailable = 0; // Number of places where getSubstring returned null. - volatile _int64 nBasesProcessed = 0; - volatile int runningThreadCount; - unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads); BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads]; @@ -403,6 +395,14 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla lifted_index_pass_start: + volatile _int64 nonSeeds = 0; + volatile _int64 seedsWithMultipleOccurrences = 0; + volatile _int64 genomeLocationsInOverflowTable = 0; // Number of extra hits on duplicate indices. This should come out once we implement the overflow table. + volatile _int64 bothComplementsUsed = 0; // Number of hash buckets where both complements are used + volatile _int64 noBaseAvailable = 0; // Number of places where getSubstring returned null. + volatile _int64 nBasesProcessed = 0; + volatile int runningThreadCount; + SingleWaiterObject doneObject; CreateSingleWaiterObject(&doneObject); @@ -489,18 +489,19 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla // (_int64)hashTables[j]->GetUsedElementCount() * 100 / (_int64)hashTables[j]->GetTableSize()); } - WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n", - seedsWithMultipleOccurrences, - (seedsWithMultipleOccurrences * 100) / countOfBases, - genomeLocationsInOverflowTable, - genomeLocationsInOverflowTable * 100 / countOfBases, - nonSeeds, - (nonSeeds * 100) / countOfBases, - bothComplementsUsed, - noBaseAvailable); - - WriteStatusMessage("Hash table build took %llds\n",(timeInMillis() + 500 - start) / 1000); - + if (unliftedIndex == NULL) { + WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n", + seedsWithMultipleOccurrences, + (seedsWithMultipleOccurrences * 100) / countOfBases, + genomeLocationsInOverflowTable, + genomeLocationsInOverflowTable * 100 / countOfBases, + nonSeeds, + (nonSeeds * 100) / countOfBases, + bothComplementsUsed, + noBaseAvailable); + + WriteStatusMessage("Hash table build took %llds\n", (timeInMillis() + 500 - start) / 1000); + } // // We're done with the raw genome. Delete it to save some memory. // @@ -765,6 +766,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) { // create a sub-index with only seeds that occur in alt contigs // need to build lifted index here because it will reorder unlifted index overflow table + WriteStatusMessage("Creating sub-index for alt contigs...\n"); snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName); bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact, hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index); @@ -773,6 +775,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla soft_exit(1); return false; } + WriteStatusMessage("...Finished creating sub-index for alt contigs\n"); } // From 8d12f71421b2630d2b47b1e88ed4ce06becefbc7 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Tue, 12 Jan 2016 15:16:49 -0800 Subject: [PATCH 17/19] Pre-alloc alt match info --- SNAPLib/BaseAligner.cpp | 25 ++++++++++----------- SNAPLib/BaseAligner.h | 2 +- SNAPLib/VariableSizeVector.h | 42 +++++++++++++++++++++++++----------- 3 files changed, 44 insertions(+), 25 deletions(-) diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp index cd6b72eb..febad5f6 100644 --- a/SNAPLib/BaseAligner.cpp +++ b/SNAPLib/BaseAligner.cpp @@ -227,8 +227,8 @@ Routine Description: hashTableEpoch = 0; if (genome->hasAltContigs()) { - // todo: BigAlloc / new(allocator) -> fixed size, avoid reallocs; reserve space for max size - allMatches = new MatchInfoVector(); + MatchInfo* entries = (MatchInfo*)allocator->allocate(sizeof(MatchInfo)* maxHitsToConsider); + allMatches = new MatchInfoVector(entries, maxHitsToConsider); } } @@ -1503,17 +1503,18 @@ BaseAligner::getBigAllocatorReservation(GenomeIndex *index, bool ownLandauVishki } return - contigCounters + - sizeof(_uint64) * 14 + // allow for alignment - sizeof(BaseAligner) + // our own member variables + contigCounters + + sizeof(_uint64)* 14 + // allow for alignment + sizeof(BaseAligner)+ // our own member variables (ownLandauVishkin ? - LandauVishkin<>::getBigAllocatorReservation() + - LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects - sizeof(char) * maxReadSize * 2 + // rcReadData - sizeof(char) * maxReadSize * 4 + 2 * MAX_K + // reversed read (both) - sizeof(BYTE) * (maxReadSize + 7 + 128) / 8 + // seed used - sizeof(HashTableElement) * hashTableElementPoolSize + // hash table element pool - sizeof(HashTableAnchor) * candidateHashTablesSize * 2 + // candidate hash table (both) + LandauVishkin<>::getBigAllocatorReservation() + + LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects + sizeof(char)* maxReadSize * 2 + // rcReadData + sizeof(char)* maxReadSize * 4 + 2 * MAX_K + // reversed read (both) + sizeof(BYTE)* (maxReadSize + 7 + 128) / 8 + // seed used + sizeof(HashTableElement)* hashTableElementPoolSize + // hash table element pool + sizeof(HashTableAnchor)* candidateHashTablesSize * 2 + // candidate hash table (both) + (index->getGenome()->hasAltContigs() ? sizeof(MatchInfo) * maxHitsToConsider : 0) + // matches for alt contigs sizeof(HashTableElement) * (maxSeedsToUse + 1); // weight lists } diff --git a/SNAPLib/BaseAligner.h b/SNAPLib/BaseAligner.h index ee369a7c..5a876c04 100644 --- a/SNAPLib/BaseAligner.h +++ b/SNAPLib/BaseAligner.h @@ -342,7 +342,7 @@ class BaseAligner { return a.liftedLocation < b.liftedLocation; } - typedef VariableSizeVector MatchInfoVector; + typedef VariableSizeVector MatchInfoVector; MatchInfoVector* allMatches; diff --git a/SNAPLib/VariableSizeVector.h b/SNAPLib/VariableSizeVector.h index e32d3dc2..f1b21bc7 100644 --- a/SNAPLib/VariableSizeVector.h +++ b/SNAPLib/VariableSizeVector.h @@ -4,6 +4,7 @@ // // A variable-size vector that does not perform any memory allocation except to grow. +// if grow==0 then it must be supplied with a vector to start and it will NOT grow or deallocate // template class VariableSizeVector @@ -17,31 +18,43 @@ class VariableSizeVector WriteErrorMessage("%s: allocate %lld - consider using BigAlloc\n", __FUNCTION__, bytes); } #endif + _ASSERT(grow != 0); return big ? BigAlloc(bytes) : malloc(bytes); } inline static void deallocate(void* p) { - if (big) { BigDealloc(p); } else { free(p); } + if (grow != 0) { + if (big) { BigDealloc(p); } else { free(p); } + } } public: + VariableSizeVector(int i_capacity = 16) : entries(NULL), count(0), capacity(i_capacity) - {} + { _ASSERT(grow != 0); } - VariableSizeVector(VariableSizeVector& other) + VariableSizeVector(V* i_entries, int i_capacity) + : entries(i_entries), capacity(i_capacity), count(0) + { _ASSERT(grow == 0); } + + VariableSizeVector(VariableSizeVector& other) : entries(other.entries), count(other.count), capacity(other.capacity) { other.count = 0; + other.capacity = 0; other.entries = NULL; } ~VariableSizeVector() { if (entries != NULL) { - deallocate(entries); + if (grow != 0) { + deallocate(entries); + } entries = NULL; + capacity = 0; count = 0; } } @@ -57,7 +70,7 @@ class VariableSizeVector } public: - void operator=(VariableSizeVector& other) + void operator=(VariableSizeVector& other) { entries = other.entries; capacity = other.capacity; @@ -68,6 +81,13 @@ class VariableSizeVector void reserve(_int64 newCapacity) { + if (grow == 0) { + if (newCapacity <= capacity) { + return; + } + WriteErrorMessage("Unable to grow fixed VariableSizeVector from %ld to %ld\n", capacity, newCapacity); + soft_exit(1); + } _ASSERT(newCapacity >= 0); if (newCapacity <= capacity && entries != NULL) { return; @@ -89,8 +109,10 @@ class VariableSizeVector inline void clean() { if (entries != NULL) { - deallocate(entries); - entries = NULL; + if (grow != 0) { + deallocate(entries); + entries = NULL; + } count = 0; } } @@ -109,11 +131,7 @@ class VariableSizeVector inline void push_back(V& value) { - if (entries == NULL) { - reserve(capacity); - } else if (count == capacity) { - reserve((int) (((_int64) count * grow) / 100)); - } + increase(); _ASSERT(count < capacity); entries[count++] = value; } From fa17fff4104b2679aceaaf14e0fb6c5676fc92af Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Thu, 14 Jan 2016 09:32:27 -0800 Subject: [PATCH 18/19] Add parsing options for v38 ref --- SNAPLib/FASTA.cpp | 111 ++++++++++++++++++----- SNAPLib/FASTA.h | 6 +- SNAPLib/Genome.cpp | 31 +++---- SNAPLib/Genome.h | 2 +- SNAPLib/GenomeIndex.cpp | 88 ++++++++++-------- SNAPLib/GenomeIndex.h | 2 +- SNAPLib/IntersectingPairedEndAligner.cpp | 17 ++-- SNAPLib/IntersectingPairedEndAligner.h | 1 - 8 files changed, 165 insertions(+), 93 deletions(-) diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp index 659bc990..05ae845d 100644 --- a/SNAPLib/FASTA.cpp +++ b/SNAPLib/FASTA.cpp @@ -37,6 +37,8 @@ ReadFASTAGenome( const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, + const char *chrTag, + const char *chrMapFilename, AltContigMap* altMap) { // @@ -54,15 +56,38 @@ ReadFASTAGenome( isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true; isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true; + int lineBufferSize = 0; + char *lineBuffer; + + map chrMap; + if (chrMapFilename != NULL) { + FILE* mapFile = fopen(chrMapFilename, "r"); + if (mapFile == NULL) { + WriteErrorMessage("Unable to open -chrmap file '%s'\n", chrMapFilename); + return NULL; + } + while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, mapFile)) { + if (lineBuffer[0] == '#') { + continue; + } + string chrom; + for (char * token = strtok(lineBuffer, "\t\r\n"); token != NULL; token = strtok(NULL, "\t\r\n")) { + if (token == lineBuffer) { + chrom = string(token); + } else { + chrMap[string(token)] = chrom; + } + } + } + fclose(mapFile); + } + FILE *fastaFile = fopen(fileName, "r"); if (fastaFile == NULL) { WriteErrorMessage("Unable to open FASTA file '%s' (even though we already got its size)\n",fileName); return NULL; } - int lineBufferSize = 0; - char *lineBuffer; - // // Count the chromosomes // @@ -97,39 +122,59 @@ ReadFASTAGenome( // // Now supply the chromosome name. // - char * terminator = lineBuffer + strlen(lineBuffer); - char * p; - if (NULL != pieceNameTerminatorCharacters) { - for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { - p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]); + const char *chrName; + int chrNameLen; + if (chrTag == NULL) { + char * terminator = lineBuffer + strlen(lineBuffer); + char * p; + if (NULL != pieceNameTerminatorCharacters) { + for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) { + p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]); + if (NULL != p && p < terminator) { + terminator = p; + } + } + } + if (spaceIsAPieceNameTerminator) { + p = strchr(lineBuffer, ' '); + if (NULL != p && p < terminator) { + terminator = p; + } + p = strchr(lineBuffer, '\t'); if (NULL != p && p < terminator) { terminator = p; } } - } - if (spaceIsAPieceNameTerminator) { - p = strchr(lineBuffer, ' '); + p = strchr(lineBuffer, '\n'); if (NULL != p && p < terminator) { terminator = p; } - p = strchr(lineBuffer, '\t'); + p = strchr(lineBuffer, '\r'); if (NULL != p && p < terminator) { terminator = p; } - } - p = strchr(lineBuffer, '\n'); - if (NULL != p && p < terminator) { - terminator = p; - } - p = strchr(lineBuffer, '\r'); - if (NULL != p && p < terminator) { - terminator = p; + chrName = lineBuffer + 1; + chrNameLen = (int) (terminator - lineBuffer - 1); + } else { + if (!FindFASTATagValue(lineBuffer, chrTag, &chrName, &chrNameLen)) { + WriteErrorMessage("Unable to find tag '%s' in contig '%s'\n", chrTag, lineBuffer + 1); + soft_exit(1); + } + if (chrMapFilename != NULL) { + map::iterator mapped = chrMap.find(string(chrName, chrName + chrNameLen)); + if (mapped != chrMap.end()) { + chrName = mapped->second.data(); + chrNameLen = (int) mapped->second.length(); + } + } } if (altMap != NULL) { - altMap->addFastaContig(lineBuffer, terminator); + altMap->addFastaContig(lineBuffer, chrName, chrNameLen); } - *terminator = '\0'; - genome->startContig(lineBuffer+1, altMap); + char *contigName = (char*) malloc(chrNameLen + 1); + memcpy(contigName, chrName, chrNameLen); + contigName[chrNameLen] = '\0'; + genome->startContig(contigName, altMap); } else { if (!inAContig) { WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n"); @@ -208,3 +253,23 @@ bool AppendFASTAGenome(const Genome *genome, FILE *fasta, const char *prefix="") } return !ferror(fasta); } + + bool +FindFASTATagValue(const char* lineBuffer, const char* tagName, const char ** pTagValue, int * pValueLength) +{ + const char *tag = lineBuffer; + do { + tag = strstr(tag + 1, tagName); + if (tag == NULL) { + return false; + } + } while (tag[-1] != '>' && tag[-1] != '|' && tag[strlen(tagName)] != '|'); + *pTagValue = tag + strlen(tagName) + 1; // Format is "tag|value| + const char *tagValueEnd = strchr(*pTagValue, '|'); + if (tagValueEnd == NULL) { + WriteErrorMessage("Badly formatted tag '%s' in contig '%s'\n", tag, lineBuffer + 1); + soft_exit(1); + } + *pValueLength = (int) (tagValueEnd - *pTagValue); + return true; +} diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h index cda9a1f8..f3c0e3d3 100644 --- a/SNAPLib/FASTA.h +++ b/SNAPLib/FASTA.h @@ -27,7 +27,7 @@ Revision History: #include "Genome.h" const Genome * -ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, AltContigMap* altMap); +ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, const char* chrTag, const char* chrMapFilename, AltContigMap* altMap); // // The FASTA appending functions return whether the write was successful. @@ -39,3 +39,7 @@ ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool AppendFASTAGenome(const Genome *, FILE *fasta); + +// utility for parsing FASTA tags + bool +FindFASTATagValue(const char* lineBuffer, const char* tag, const char ** pTagValue, int * pValueLength); diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp index 8a6a4fb4..490393b5 100755 --- a/SNAPLib/Genome.cpp +++ b/SNAPLib/Genome.cpp @@ -24,6 +24,7 @@ Revision History: #include "stdafx.h" #include "Genome.h" +#include "FASTA.h" #include "GenericFile.h" #include "GenericFile_map.h" #include "Compat.h" @@ -33,6 +34,9 @@ Revision History: #include "Util.h" #include "VariableSizeVector.h" +#include +using namespace std; + Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs) : maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs), mappedFile(NULL), minAltLocation(i_maxBases) @@ -545,7 +549,7 @@ GenomeLocation Genome::getLiftedLocation(GenomeLocation altLocation) const if (altLocation < minAltLocation) { return altLocation; } - const Contig* alt = getContigAtLocation(altLocation); + const Contig* alt = getContigAtLocation(altLocation + chromosomePadding / 2); if (alt == NULL || ! alt->isAlternate) { return altLocation; } @@ -655,9 +659,8 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum soft_exit(1); } *q = '\0'; - char * tag = (char*) malloc(q - p + 2); + char * tag = (char*) malloc(q - p + 1); strcpy(tag, p); - strcat(tag, "|"); result->accessionFastaTag = tag; // get names for each column type (last 2 are optional) @@ -760,34 +763,26 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum return result; } -void AltContigMap::addFastaContig(const char* lineBuffer, const char* nameTerminator) +void AltContigMap::addFastaContig(const char* lineBuffer, const char* chrName, int chrNameLength) { // get the name - char* name = (char*) malloc(nameTerminator - lineBuffer); - memcpy(name, lineBuffer + 1, nameTerminator - lineBuffer - 1); - name[nameTerminator - lineBuffer - 1] = 0; + string name(chrName, chrName + chrNameLength); // find the accession number - const char* tag = strstr(lineBuffer, accessionFastaTag); - const char* p = tag + strlen(accessionFastaTag); - if (tag == NULL || *p == '\0') { + const char *accessionStart; + int accessionLength; + if (!FindFASTATagValue(lineBuffer, accessionFastaTag, &accessionStart, &accessionLength)) { WriteErrorMessage("Unable to find accession code for contig %s in FASTA line\n%s\n", name, lineBuffer); soft_exit(1); } - const char*q = p; - while (*q != '\0' && *q != '|' && *q != ' ' && *q != '\t' && *q != '\r' && *q != '\n') { - q++; - } - char* accession = (char*)malloc(q - p); - memcpy(accession, p, q - p); - *(accession + (q - p)) = '\0'; + string accession(accessionStart, accessionStart + accessionLength); nameToAccession[name] = accession; accessionToName[accession] = name; StringAltContigMap::iterator alt = altsByAccession.find(accession); if (alt != altsByAccession.end()) { - alt->second.name = name; + alt->second.name = (new string(name))->data(); // alloc & never free, but tiny :-) } } diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h index c01a3b61..f1d285e5 100644 --- a/SNAPLib/Genome.h +++ b/SNAPLib/Genome.h @@ -336,7 +336,7 @@ class AltContigMap static AltContigMap* readFromFile(const char* filename, const char* columnList); - void addFastaContig(const char* lineBuffer, const char* terminator); + void addFastaContig(const char* lineBuffer, const char* chrName, int chrNameLength); void setAltContig(Genome::Contig* contig); diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 637bf9b9..12f86df4 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -46,7 +46,7 @@ static const double DEFAULT_SLACK = 0.3; static const unsigned DEFAULT_PADDING = 500; static const unsigned DEFAULT_KEY_BYTES = 4; static const unsigned DEFAULT_LOCATION_SIZE = 4; -static const char* DEFAULT_ALT_COLUMNS = "gb,alt_scaf_acc,parent_acc,ori,alt_scaf_start,alt_scaf_stop,parent_start,parent_stop,alt_start_tail,alt_stop_tail"; +static const char* DEFAULT_ALT_COLUMNS = "ref,alt_scaf_acc,parent_acc,ori,alt_scaf_start,alt_scaf_stop,parent_start,parent_stop,alt_start_tail,alt_stop_tail"; const char *GenomeIndexFileName = "GenomeIndex"; const char *OverflowTableFileName = "OverflowTable"; const char *GenomeIndexHashFileName = "GenomeIndexHash"; @@ -89,6 +89,8 @@ static void usage() "-altmap file Tab-separated file of alt contig mapping information\n" "-altcols columns Comma-separated list of columns describing alt mapping file\n" " Default is v38 %s\n" + "-chrtag tag Tag for chrom name\n" + "-chrmap file Tab-separated file of chrom name and tag values\n" , DEFAULT_SEED_SIZE, DEFAULT_SLACK, @@ -128,6 +130,8 @@ GenomeIndex::runIndexer( bool smallMemory = false; const char* altMapFilename = NULL; const char* altMapColumns = DEFAULT_ALT_COLUMNS; + const char* chrTag = NULL; + const char* chrMapFilename = NULL; for (int n = 2; n < argc; n++) { if (strcmp(argv[n], "-s") == 0) { @@ -208,12 +212,33 @@ GenomeIndex::runIndexer( } else { usage(); } + } else if (!strcmp(argv[n], "-chrtag")) { + if (n + 1 < argc) { + chrTag = argv[n + 1]; + n++; + } + else { + usage(); + } + } else if (!strcmp(argv[n], "-chrmap")) { + if (n + 1 < argc) { + chrMapFilename = argv[n + 1]; + n++; + } + else { + usage(); + } } else { WriteErrorMessage("Invalid argument: %s\n\n", argv[n]); usage(); } } + if (chrMapFilename != NULL && chrTag == NULL) { + WriteErrorMessage("The -chrmap option requires the -chrtag option to be specified\n"); + usage(); + } + if (seedLen < 16 || seedLen > 32) { // Seeds are stored in 64 bits, so they can't be larger than 32 bases for now. WriteErrorMessage("Seed length must be between 16 and 32, inclusive\n"); @@ -246,7 +271,7 @@ GenomeIndex::runIndexer( AltContigMap* altMap = altMapFilename != NULL ? AltContigMap::readFromFile(altMapFilename, altMapColumns) : NULL; - const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, altMap); + const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, chrTag, chrMapFilename, altMap); if (NULL == genome) { WriteErrorMessage("Unable to read FASTA file\n"); soft_exit(1); @@ -1059,13 +1084,12 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables); - _int64 nHits, nRCHits; - if (unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex, &nHits, &nRCHits)) { - if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) { - _uint64 value = 42; - seedsSeen->Insert(seed.getBases(), &value); - numExactSeeds[seed.getHighBases(hashTableKeySize)]++; - } + bool addSeed = unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex) || + ((!large) && hasAnyAltHits(~seed, i, unliftedIndex)); + if (addSeed && NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) { + _uint64 value = 42; + seedsSeen->Insert(seed.getBases(), &value); + numExactSeeds[seed.getHighBases(hashTableKeySize)]++; } } @@ -1216,29 +1240,20 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) _ASSERT(whichHashTable < context->nHashTables); - _int64 nRepeats = 1; - if (context->unliftedIndex != NULL) { - _int64 nHits, nRCHits; - if (hasAnyAltHits(seed, i, context->unliftedIndex, &nHits, &nRCHits)) { - nRepeats = nHits + nRCHits; - } else { - nRepeats = 0; - } - } - for (; nRepeats > 0; nRepeats--) { - if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) { - PerCounterBatch *batch = &batches[whichHashTable]; - AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]); - batch->apply(&(*context->approxCounters)[whichHashTable]); - ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]); + bool addSeed = context->unliftedIndex == NULL || hasAnyAltHits(seed, i, context->unliftedIndex) || + ((!large) && hasAnyAltHits(~seed, i, context->unliftedIndex)); + if (addSeed && batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) { + PerCounterBatch *batch = &batches[whichHashTable]; + AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]); + batch->apply(&(*context->approxCounters)[whichHashTable]); + ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]); - _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds); + _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds); - if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds) / printBatchSize) { - WriteStatusMessage("Bias computation: %lld / %lld\n", (basesProcessed / printBatchSize)*printBatchSize, (_int64)countOfBases); - } - unrecordedSkippedSeeds = 0; // We've now recorded them. + if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds) / printBatchSize) { + WriteStatusMessage("Bias computation: %lld / %lld\n", (basesProcessed / printBatchSize)*printBatchSize, (_int64)countOfBases); } + unrecordedSkippedSeeds = 0; // We've now recorded them. } } @@ -1269,18 +1284,13 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) bool GenomeIndex::hasAnyAltHits( - Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits) + Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex) { - if (unliftedIndex == NULL) { - return false; - } _int64 nHits, nRCHits; if (unliftedIndex->doesGenomeIndexHave64BitLocations()) { const GenomeLocation *hits, *rcHits; GenomeLocation singleHit[2], singleRCHit[2]; - unliftedIndex->lookupSeed(seed, pnHits, &hits, pnRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); - *pnHits = nHits; - *pnRCHits = nRCHits; + unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); #define HAS_ANY_ALTS \ if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \ @@ -1290,12 +1300,12 @@ GenomeIndex::hasAnyAltHits( } \ } \ for (int i = 0; i < nRCHits; i++) { \ - if (unliftedIndex->genome->isAltLocation(hits[i])) { \ + if (unliftedIndex->genome->isAltLocation(rcHits[i])) { \ return true; \ } \ } \ - return false; \ - } + } \ + return false; HAS_ANY_ALTS } else { const unsigned *hits, *rcHits; diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index c0897a79..0bcaad1a 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -207,7 +207,7 @@ class GenomeIndex { static void ComputeBiasTableWorkerThreadMain(void *param); - static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits); + static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex); struct OverflowBackpointer; diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp index 92f73883..1c5087df 100644 --- a/SNAPLib/IntersectingPairedEndAligner.cpp +++ b/SNAPLib/IntersectingPairedEndAligner.cpp @@ -534,7 +534,6 @@ IntersectingPairedEndAligner::align( lowestFreeScoringMateCandidate[whichSetPair]++; previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits; - if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) { lastGenomeLocationForReadWithMoreHits = 0; outOfMoreHitsLocations = true; @@ -1385,14 +1384,14 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren } bool - IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation) +IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit( + GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation) { // // Look through all of the lookups and find the one with the highest location smaller than the current one. // GenomeLocation foundLocation = 0; bool anyFound = false; - const bool setUnlifted = unliftedGenomeLocation != NULL; // // Run through the lookups pushing up any that are at the most recently returned @@ -1401,8 +1400,8 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren for (unsigned i = 0; i < nLookupsUsed; i++) { _int64 *currentHitForIntersection; _int64 nHits; - GenomeLocation hitLocation; - GenomeLocation unliftedHitLocation; + GenomeLocation hitLocation = -1; + GenomeLocation unliftedHitLocation = -2; unsigned seedOffset; // @@ -1414,7 +1413,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren seedOffset = lookups[i].seedOffset; \ if (nHits != *currentHitForIntersection) { \ hitLocation = lookups[i].hits[*currentHitForIntersection]; \ - if (setUnlifted) { \ + if (unliftedGenomeLocation != NULL) { \ unliftedHitLocation = lookups[i].unliftedHits[*currentHitForIntersection]; \ } \ } @@ -1436,12 +1435,12 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren } if (doesGenomeIndexHave64BitLocations) { hitLocation = lookups64[i].hits[*currentHitForIntersection]; - if (setUnlifted) { + if (unliftedGenomeLocation != NULL) { unliftedHitLocation = lookups64[i].unliftedHits[*currentHitForIntersection]; } } else { hitLocation = lookups32[i].hits[*currentHitForIntersection]; - if (setUnlifted) { + if (unliftedGenomeLocation != NULL) { unliftedHitLocation = lookups32[i].unliftedHits[*currentHitForIntersection]; } } @@ -1452,7 +1451,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren hitLocation >= seedOffset) // found location isn't too small to push us before the beginning of the genome { *genomeLocation = foundLocation = hitLocation - seedOffset; - if (setUnlifted) { + if (unliftedGenomeLocation != NULL) { *unliftedGenomeLocation = unliftedHitLocation - seedOffset; } *seedOffsetFound = seedOffset; diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h index 39c258ef..3884892d 100644 --- a/SNAPLib/IntersectingPairedEndAligner.h +++ b/SNAPLib/IntersectingPairedEndAligner.h @@ -229,7 +229,6 @@ class IntersectingPairedEndAligner : public PairedEndAligner // bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation); - // // Find the highest genome address. // From 919b95962e2e1f52228ea5489147286465037e95 Mon Sep 17 00:00:00 2001 From: Ravi Pandya Date: Thu, 14 Jan 2016 10:42:45 -0800 Subject: [PATCH 19/19] Fix rc sort, non-large alt index build --- SNAPLib/GenomeIndex.cpp | 29 +++++++++++++++-------------- SNAPLib/GenomeIndex.h | 2 +- SNAPLib/SortedDataWriter.cpp | 10 ++++++++++ 3 files changed, 26 insertions(+), 15 deletions(-) diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp index 12f86df4..06fb64d4 100644 --- a/SNAPLib/GenomeIndex.cpp +++ b/SNAPLib/GenomeIndex.cpp @@ -1084,8 +1084,7 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, _ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables); - bool addSeed = unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex) || - ((!large) && hasAnyAltHits(~seed, i, unliftedIndex)); + bool addSeed = unliftedIndex == NULL || unliftedIndex->hasAnyAltHitsAndLocationIsFirst(seed, i, large); if (addSeed && NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) { _uint64 value = 42; seedsSeen->Insert(seed.getBases(), &value); @@ -1240,8 +1239,7 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) _ASSERT(whichHashTable < context->nHashTables); - bool addSeed = context->unliftedIndex == NULL || hasAnyAltHits(seed, i, context->unliftedIndex) || - ((!large) && hasAnyAltHits(~seed, i, context->unliftedIndex)); + bool addSeed = context->unliftedIndex == NULL || context->unliftedIndex->hasAnyAltHitsAndLocationIsFirst(seed, i, large); if (addSeed && batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) { PerCounterBatch *batch = &batches[whichHashTable]; AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]); @@ -1283,25 +1281,28 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param) } bool -GenomeIndex::hasAnyAltHits( - Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex) +GenomeIndex::hasAnyAltHitsAndLocationIsFirst( + Seed seed, GenomeLocation genomeLocation, bool large) const { _int64 nHits, nRCHits; - if (unliftedIndex->doesGenomeIndexHave64BitLocations()) { + if (doesGenomeIndexHave64BitLocations()) { const GenomeLocation *hits, *rcHits; GenomeLocation singleHit[2], singleRCHit[2]; - unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); + lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]); #define HAS_ANY_ALTS \ - if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ - (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \ + bool isFirst = large \ + ? (nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \ + (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits)) \ + : nHits > 0 && genomeLocation == *hits; \ + if (isFirst) { \ for (int i = 0; i < nHits; i++) { \ - if (unliftedIndex->genome->isAltLocation(hits[i])) { \ + if (genome->isAltLocation(hits[i])) { \ return true; \ } \ } \ for (int i = 0; i < nRCHits; i++) { \ - if (unliftedIndex->genome->isAltLocation(rcHits[i])) { \ - return true; \ + if (genome->isAltLocation(rcHits[i])) { \ + return true; \ } \ } \ } \ @@ -1309,7 +1310,7 @@ GenomeIndex::hasAnyAltHits( HAS_ANY_ALTS } else { const unsigned *hits, *rcHits; - unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); + lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits); HAS_ANY_ALTS } } diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h index 0bcaad1a..86a28c4f 100644 --- a/SNAPLib/GenomeIndex.h +++ b/SNAPLib/GenomeIndex.h @@ -207,7 +207,7 @@ class GenomeIndex { static void ComputeBiasTableWorkerThreadMain(void *param); - static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex); + bool hasAnyAltHitsAndLocationIsFirst(Seed seed, GenomeLocation genomeLocation, bool large) const; struct OverflowBackpointer; diff --git a/SNAPLib/SortedDataWriter.cpp b/SNAPLib/SortedDataWriter.cpp index 85ad336b..2d8970fd 100644 --- a/SNAPLib/SortedDataWriter.cpp +++ b/SNAPLib/SortedDataWriter.cpp @@ -184,6 +184,16 @@ SortedDataFilter::onAdvance( GenomeDistance bytes, GenomeLocation location) { + if (location != InvalidGenomeLocation && parent->genome->hasAltContigs()) { + // reads mapped to RC alt contigs need to have location flipped so they sort properly + const Genome::Contig* c = parent->genome->getContigAtLocation(location); + if (c != NULL && c->isAlternateRC) { + GenomeLocation rcLocation; + GenomeDistance ignore; + parent->format->getSortInfo(parent->genome, data, bytes, &rcLocation, &ignore); + location = rcLocation; + } + } SortEntry entry(batchOffset, bytes, location); #ifdef VALIDATE_SORT if (memcmp(data, "BAM", 3) != 0 && memcmp(data, "@HD", 3) != 0) { // skip header block