From 8e4786b411c7641555ca907844ad075571a8e2b0 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Tue, 1 Dec 2015 20:16:28 -0800
Subject: [PATCH 01/19] Alt mapping - initial sketch in single & paired aligner
 - WIP

---
 SNAPLib/BaseAligner.cpp                  |  69 +++++++++++++-
 SNAPLib/FASTA.h                          |   8 --
 SNAPLib/Genome.h                         |  12 ++-
 SNAPLib/GenomeIndex.cpp                  |  33 +++++++
 SNAPLib/GenomeIndex.h                    |   7 ++
 SNAPLib/IntersectingPairedEndAligner.cpp | 112 ++++++++++++++++-------
 SNAPLib/IntersectingPairedEndAligner.h   |  37 ++++++--
 7 files changed, 228 insertions(+), 50 deletions(-)
diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp
index eb9fb4cb..02efa6c0 100644
--- a/SNAPLib/BaseAligner.cpp
+++ b/SNAPLib/BaseAligner.cpp
@@ -44,6 +44,27 @@ using std::min;
 #define TRACE(...) {}
 #endif
 
+
+struct MatchInfo    // plain C++ struct: the tag already names the type, so no typedef wrapper
+{
+    GenomeLocation  location;           // location the match was recorded at
+    GenomeLocation  liftedLocation;     // key that matchInfoComparator orders by
+    double          matchProbability;   // match probability recorded for this location
+
+    MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p)
+        : location(_loc), liftedLocation(_lifted), matchProbability(_p) {}
+};
+
+// Less-than predicate on liftedLocation; handed to std::sort to order matches ascending.
+bool
+matchInfoComparator(const MatchInfo& a, const MatchInfo& b)
+{
+    const bool aSortsFirst = b.liftedLocation > a.liftedLocation;
+    return aSortsFirst;
+}
+
+typedef VariableSizeVector<MatchInfo> MatchInfoVector; // list of MatchInfo entries, appended to with push_back and sorted with std::sort
+
 BaseAligner::BaseAligner(
     GenomeIndex    *i_genomeIndex,
     unsigned        i_maxHitsToConsider,
@@ -652,6 +673,37 @@ Return Value:
     return;
 }
 
+  /**
+    * Sum, over each cluster of overlapping lifted matches, the best match probability in that cluster.
+    * Sorts *allMatches in place. A match joins the current cluster when its liftedLocation is not past the end (start + length - 1) of the match before it.
+    */
+    double
+computeLiftedCandidateProbability(
+    MatchInfoVector* allMatches,
+    GenomeDistance length)
+{
+    const int nMatches = (allMatches == NULL) ? 0 : (int)allMatches->size();
+    if (nMatches == 0) {
+        return 0.0;
+    }
+    std::sort(allMatches->begin(), allMatches->end(), matchInfoComparator);
+    // Open the first cluster explicitly so the cluster extent is never read before it is set.
+    double totalProbability = 0.0;
+    double bestInCluster = (*allMatches)[0].matchProbability;
+    GenomeLocation farthest = (*allMatches)[0].liftedLocation + length - 1;
+    for (int i = 1; i < nMatches; i++) {
+        const MatchInfo& m = (*allMatches)[i];
+        if (m.liftedLocation > farthest) {
+            totalProbability += bestInCluster;  // previous cluster is complete; m starts a new one
+            bestInCluster = m.matchProbability;
+        } else if (m.matchProbability > bestInCluster) {
+            bestInCluster = m.matchProbability;
+        }
+        farthest = m.liftedLocation + length - 1;  // input is sorted ascending, so the extent never moves backwards
+    }
+    return totalProbability + bestInCluster;    // account for the final, still-open cluster
+}
+
     bool
 BaseAligner::score(
         bool                     forceResult,
@@ -744,6 +796,11 @@ Return Value:
 #endif
 
     unsigned weightListToCheck = highestUsedWeightList;
+    MatchInfoVector* allMatches = NULL;
+    bool anyAltMatches = FALSE;
+    if (genome->hasAltContigs()) {
+        allMatches = new MatchInfoVector();
+    }
 
     do {
         //
@@ -764,6 +821,9 @@ Return Value:
                 primaryResult->score = bestScore;
                 if (bestScore <= maxK) {
                     primaryResult->location = bestScoreGenomeLocation;
+                    if (anyAltMatches) {
+                        probabilityOfAllCandidates = computeLiftedCandidateProbability(allMatches, read[0]->getDataLength());
+                    }
                     primaryResult->mapq = computeMAPQ(probabilityOfAllCandidates, probabilityOfBestCandidate, bestScore, popularSeedsSkipped);
                     if (primaryResult->mapq >= MAPQ_LIMIT_FOR_SINGLE_HIT) {
                         primaryResult->status = SingleHit;
@@ -913,6 +973,14 @@ Return Value:
                             // We could mark as scored anything in between the old and new genome offsets, but it's probably not worth the effort since this is
                             // so rare and all it would do is same time.
                             //
+
+                            // remember in case there are alt matches
+                            if (allMatches != NULL) {
+                                if ((! anyAltMatches) && genome->getContigAtLocation(genomeLocation)->isAlternate) {
+                                    anyAltMatches = TRUE;
+                                }
+                                allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability));
+                            }
                         }
                     }
                 } else { // if we had genome data to compare against
@@ -1114,7 +1182,6 @@ Return Value:
     return false;
 }
 
-
     void
 BaseAligner::prefetchHashTableBucket(GenomeLocation genomeLocation, Direction direction)
 {
diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h
index 6e542f32..44cb810f 100644
--- a/SNAPLib/FASTA.h
+++ b/SNAPLib/FASTA.h
@@ -39,11 +39,3 @@ ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters,
 
     bool
 AppendFASTAGenome(const Genome *, FILE *fasta);
-
-//
-// This is arbitrary; is there some existing convention?
-//
-inline const char *diploidFASTASexPrefix(bool male)
-{
-    return male ? "PATERNAL|" : "MATERNAL|";
-}
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
index 84e94f52..d65d9008 100644
--- a/SNAPLib/Genome.h
+++ b/SNAPLib/Genome.h
@@ -245,9 +245,15 @@ class Genome {
         }
 
         struct Contig {
-            Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL) {}
+            Contig() : beginningLocation(InvalidGenomeLocation), length(0), isAlternate(FALSE), altGroup(-1), isReverseStrand(FALSE), // listed in member declaration order
+                    liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0), nameLength(0), name(NULL) {} // altGroup used to be left uninitialized; -1 is an assumed "no group" value -- NOTE(review): confirm
             GenomeLocation     beginningLocation;
             GenomeDistance     length;
+            bool               isAlternate;
+            int                altGroup; // each group of overlapping alt regions is given a unique ID
+            bool               isReverseStrand; // if reversed alternate strand
+            GenomeLocation     liftedLocation; // location of beginning of alt contig mapping to primary
+            GenomeLocation     contextBefore, contextAfter;   // context sequence added from primary (alts near ends have less context)
             unsigned           nameLength;
             char              *name;
         };
@@ -261,6 +267,10 @@ class Genome {
         const Contig *getNextContigAfterLocation(GenomeLocation location) const;
         int getContigNumAtLocation(GenomeLocation location) const;    // Returns the contig number, which runs from 0 .. getNumContigs() - 1.
 
+        inline bool hasAltContigs() const { return FALSE;  } // todo: implement; stub, currently always reports no alt contigs
+
+        GenomeLocation getLiftedLocation(GenomeLocation altLocation) const { return altLocation;  } // todo: implement; stub, currently returns its argument unchanged
+
 // unused        Genome *copy() const {return copy(true,true,true);}
 // unused        Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);}
 
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 2f34ba82..72f05bd6 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -1915,6 +1915,22 @@ GenomeIndex::lookupSeed32(
     }
 }
 
+// Placeholder (todo: implement): forwards to lookupSeed32 and points the unlifted lists at the same arrays as the regular hit lists.
+    void
+GenomeIndex::lookupSeedAlt32(
+        Seed              seed,
+        _int64           *nHits,
+        const unsigned  **hits,
+        _int64           *nRCHits,
+        const unsigned  **rcHits,
+        const unsigned  **unliftedHits,
+        const unsigned  **unliftedRCHits)
+{
+    lookupSeed32(seed, nHits, hits, nRCHits, rcHits);
+    *unliftedHits = *hits;
+    *unliftedRCHits = *rcHits;
+}
+
     void
 GenomeIndex::fillInLookedUpResults32(
     const unsigned  *subEntry,
@@ -2041,6 +2057,23 @@ GenomeIndex::lookupSeed(
     }
 }
 
+    void
+GenomeIndex::lookupSeedAlt(
+        Seed                    seed,
+        _int64 *                nHits,
+        const GenomeLocation ** hits,
+        _int64 *                nRCHits,
+        const GenomeLocation ** rcHits,
+        const GenomeLocation ** unliftedHits,
+        const GenomeLocation ** unliftedRCHits,
+        GenomeLocation *        singleHit,
+        GenomeLocation *        singleRCHit)
+{
+    // todo: implement lifting; for now this forwards to lookupSeed and points the unlifted lists at the same arrays as the regular hit lists
+    lookupSeed(seed, nHits, hits, nRCHits, rcHits, singleHit, singleRCHit);
+    *unliftedHits = *hits;
+    *unliftedRCHits = *rcHits;
+}
 
     void 
 GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation)
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
index 21654618..2cdd0782 100644
--- a/SNAPLib/GenomeIndex.h
+++ b/SNAPLib/GenomeIndex.h
@@ -50,6 +50,13 @@ class GenomeIndex {
     void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit);
     void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits);
 
+    // Variants of lookupSeed/lookupSeed32 for a genome that has alt regions.
+    // hits/rcHits are locations lifted to non-alt contigs; unliftedHits/unliftedRCHits are the original locations in alt contigs.
+    // nHits/nRCHits give the count for both the lifted and the unlifted arrays.
+    // *hits==*unliftedHits && *rcHits==*unliftedRCHits iff seed has no alt hits
+    void lookupSeedAlt(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, const GenomeLocation **unliftedHits, const GenomeLocation **unliftedRCHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit);
+    void lookupSeedAlt32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits, const unsigned **unliftedHits, const unsigned **unliftedRCHits);
+
     bool doesGenomeIndexHave64BitLocations() const {return locationSize > 4;}
 
     //
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
index be94b4ec..a0d1c759 100644
--- a/SNAPLib/IntersectingPairedEndAligner.cpp
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -335,12 +335,26 @@ IntersectingPairedEndAligner::align(
             _int64 nHits[NUM_DIRECTIONS];
             const GenomeLocation *hits[NUM_DIRECTIONS];
             const unsigned *hits32[NUM_DIRECTIONS];
+            const GenomeLocation *unliftedHits[NUM_DIRECTIONS];
+            const unsigned *unliftedHits32[NUM_DIRECTIONS];
 
-            if (doesGenomeIndexHave64BitLocations) {
-                index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], 
-                            hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation());
-            } else {
-                index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]);
+            if (!doesGenomeIndexHaveAlts) {
+                if (doesGenomeIndexHave64BitLocations) {
+                    index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC],
+                        hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation());
+                }
+                else {
+                    index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]);
+                }
+            }
+            else {
+                if (doesGenomeIndexHave64BitLocations) {
+                    index->lookupSeedAlt(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], &unliftedHits[FORWARD], &unliftedHits[RC],
+                        hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation());
+                }
+                else {
+                    index->lookupSeedAlt32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC], &unliftedHits32[FORWARD], &unliftedHits32[RC]);
+                }
             }
 
             countOfHashTableLookups[whichRead]++;
@@ -353,10 +367,21 @@ IntersectingPairedEndAligner::align(
                 }
                 if (nHits[dir] < maxBigHits) {
                     totalHashTableHits[whichRead][dir] += nHits[dir];
-                    if (doesGenomeIndexHave64BitLocations) {
-                        hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], beginsDisjointHitSet[dir]);
-                    } else {
-                        hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], beginsDisjointHitSet[dir]);
+                    if (!doesGenomeIndexHaveAlts) {
+                        if (doesGenomeIndexHave64BitLocations) {
+                            hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], NULL, beginsDisjointHitSet[dir]);
+                        }
+                        else {
+                            hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], NULL, beginsDisjointHitSet[dir]);
+                        }
+                    }
+                    else {
+                        if (doesGenomeIndexHave64BitLocations) {
+                            hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], unliftedHits[dir], beginsDisjointHitSet[dir]);
+                        }
+                        else {
+                            hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], unliftedHits32[dir], beginsDisjointHitSet[dir]);
+                        }
                     }
                     beginsDisjointHitSet[dir]= false;
                 } else {
@@ -388,7 +413,6 @@ IntersectingPairedEndAligner::align(
 
     Direction setPairDirection[NUM_SET_PAIRS][NUM_READS_PER_PAIR] = {{FORWARD, RC}, {RC, FORWARD}};
 
-
     //
     // Phase 2: find all possible candidates and add them to candidate lists (for the reads with fewer and more hits).
     //
@@ -409,6 +433,10 @@ IntersectingPairedEndAligner::align(
         unsigned            lastSeedOffsetForReadWithFewerHits;
         GenomeLocation      lastGenomeLocationForReadWithFewerHits;
         GenomeLocation      lastGenomeLocationForReadWithMoreHits;
+        GenomeLocation      lastUnliftedGenomeLocationForReadWithFewerHits;
+        GenomeLocation      lastUnliftedGenomeLocationForReadWithMoreHits;
+        GenomeLocation     *pLastGenomeLocationForReadWithFewerHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithFewerHits : NULL;
+        GenomeLocation     *pLastGenomeLocationForReadWithMoreHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithMoreHits : NULL;
         unsigned            lastSeedOffsetForReadWithMoreHits;
 
         bool                outOfMoreHitsLocations = false;
@@ -416,7 +444,7 @@ IntersectingPairedEndAligner::align(
         //
         // Seed the intersection state by doing a first lookup.
         //
-        if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) {
+        if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) {
             //
             // No hits in this direction.
             //
@@ -444,7 +472,7 @@ IntersectingPairedEndAligner::align(
                 // location that's not too high.
                 //
                 if (!setPair[readWithMoreHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithFewerHits + maxSpacing,
-                                                                             &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) {
+                    &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) {
                     break;  // End of all of the mates.  We're done with this set pair.
                 }
             }
@@ -463,7 +491,7 @@ IntersectingPairedEndAligner::align(
                 }
 
                 if (!setPair[readWithFewerHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithMoreHits + maxSpacing, &lastGenomeLocationForReadWithFewerHits,
-                                                        &lastSeedOffsetForReadWithFewerHits)) {
+                    &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) {
                     //
                     // No more candidates on the read with fewer hits side.  We're done with this set pair.
                     //
@@ -490,7 +518,8 @@ IntersectingPairedEndAligner::align(
                     soft_exit(1);
                 }
                 scoringMateCandidates[whichSetPair][lowestFreeScoringMateCandidate[whichSetPair]].init(
-                                lastGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits);
+                                lastGenomeLocationForReadWithMoreHits, bestPossibleScoreForReadWithMoreHits, lastSeedOffsetForReadWithMoreHits,
+                                doesGenomeIndexHaveAlts ? lastUnliftedGenomeLocationForReadWithMoreHits : lastGenomeLocationForReadWithMoreHits);
 
 #ifdef _DEBUG
                 if (_DumpAlignments) {
@@ -505,7 +534,7 @@ IntersectingPairedEndAligner::align(
 
                 previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits;
 
-                if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits)) {
+                if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) {
                     lastGenomeLocationForReadWithMoreHits = 0;
                     outOfMoreHitsLocations = true;
                     break; // out of the loop looking for candidates on the more hits side.
@@ -550,7 +579,8 @@ IntersectingPairedEndAligner::align(
 
                 scoringCandidatePool[lowestFreeScoringCandidatePoolEntry].init(lastGenomeLocationForReadWithFewerHits, whichSetPair, lowestFreeScoringMateCandidate[whichSetPair] - 1,
                                                                                 lastSeedOffsetForReadWithFewerHits, bestPossibleScoreForReadWithFewerHits,
-                                                                                scoringCandidates[bestPossibleScore]);
+                                                                                scoringCandidates[bestPossibleScore],
+                                                                                doesGenomeIndexHaveAlts ? lastUnliftedGenomeLocationForReadWithFewerHits : lastGenomeLocationForReadWithFewerHits);
 
 
                 scoringCandidates[bestPossibleScore] = &scoringCandidatePool[lowestFreeScoringCandidatePoolEntry];
@@ -568,7 +598,7 @@ IntersectingPairedEndAligner::align(
                 maxUsedBestPossibleScoreList = max(maxUsedBestPossibleScoreList, bestPossibleScore);
             }
 
-            if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits)) {
+            if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) {
                 break;
             }
         }
@@ -602,7 +632,7 @@ IntersectingPairedEndAligner::align(
         double fewerEndMatchProbability;
         int fewerEndGenomeLocationOffset;
 
-        scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsGenomeLocation,
+        scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsUnliftedGenomeLocation,
             candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset);
 
         _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore);
@@ -635,7 +665,7 @@ IntersectingPairedEndAligner::align(
                     // use now, score it.
                     //
                     if (mate->score == -2 || mate->score == -1 && mate->scoreLimit < scoreLimit - fewerEndScore) {
-                        scoreLocation(readWithMoreHits, setPairDirection[candidate->whichSetPair][readWithMoreHits], mate->readWithMoreHitsGenomeLocation,
+                        scoreLocation(readWithMoreHits, setPairDirection[candidate->whichSetPair][readWithMoreHits], mate->readWithMoreHitsUnliftedGenomeLocation,
                             mate->seedOffset, scoreLimit - fewerEndScore, &mate->score, &mate->matchProbability,
                             &mate->genomeOffset);
 #ifdef _DEBUG
@@ -654,6 +684,16 @@ IntersectingPairedEndAligner::align(
                     if (mate->score != -1) {
                         double pairProbability = mate->matchProbability * fewerEndMatchProbability;
                         unsigned pairScore = mate->score + fewerEndScore;
+
+                        // reduce probability of pairs matching across different overlapping alts
+                        // todo: assuming if they're on different alts within maxSpacing they overlap - true for GRCh38 but not necessarily for all genomes
+                        // use crossover probability with 1 centiMorgan ~= 1Mbp
+                        if (doesGenomeIndexHaveAlts && isBothAltPairMapping(candidate, mate) &&
+                            abs(mate->readWithMoreHitsUnliftedGenomeLocation - candidate->readWithFewerHitsUnliftedGenomeLocation) > 2*maxSpacing) 
+                        {
+                            pairProbability *= 1e-8 * abs(candidate->readWithFewerHitsGenomeLocation - mate->readWithMoreHitsGenomeLocation);
+                        }
+
                         //
                         // See if this should be ignored as a merge, or if we need to back out a previously scored location
                         // because it's a worse version of this location.
@@ -713,7 +753,7 @@ IntersectingPairedEndAligner::align(
                             candidate->mergeAnchor = mergeAnchor;
                         } else {
                             merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset,
-                                pairProbability, pairScore, &oldPairProbability);
+                                pairProbability, pairScore, doesGenomeIndexHaveAlts && isNonAltPairMapping(candidate, mate), &oldPairProbability);
                         }
 
                         if (!merged) {
@@ -727,7 +767,8 @@ IntersectingPairedEndAligner::align(
                             bool isBestHit = false;
 
                             if (pairScore <= maxK && (pairScore < bestPairScore ||
-                                (pairScore == bestPairScore && pairProbability > probabilityOfBestPair))) {
+                                (pairScore == bestPairScore && (pairProbability > probabilityOfBestPair ||
+                                (pairProbability == probabilityOfBestPair && isNonAltPairMapping(candidate, mate)))))) {
                                 //
                                 // A new best hit.
                                 //
@@ -759,8 +800,8 @@ IntersectingPairedEndAligner::align(
                                 }
                                 bestPairScore = pairScore;
                                 probabilityOfBestPair = pairProbability;
-                                bestResultGenomeLocation[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset;
-                                bestResultGenomeLocation[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset;
+                                bestResultGenomeLocation[readWithFewerHits] = candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset;
+                                bestResultGenomeLocation[readWithMoreHits] = mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset;
                                 bestResultScore[readWithFewerHits] = fewerEndScore;
                                 bestResultScore[readWithMoreHits] = mate->score;
                                 bestResultDirection[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits];
@@ -786,8 +827,8 @@ IntersectingPairedEndAligner::align(
                                     result->direction[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits];
                                     result->direction[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits];
                                     result->fromAlignTogether = true;
-                                    result->location[readWithMoreHits] = mate->readWithMoreHitsGenomeLocation + mate->genomeOffset;
-                                    result->location[readWithFewerHits] = candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset;
+                                    result->location[readWithMoreHits] = mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset;
+                                    result->location[readWithFewerHits] = candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset;
                                     result->mapq[0] = result->mapq[1] = 0;
                                     result->score[readWithMoreHits] = mate->score;
                                     result->score[readWithFewerHits] = fewerEndScore;
@@ -1091,7 +1132,7 @@ IntersectingPairedEndAligner::HashTableHitSet::init()
 
 #define RL(lookups, glType, lookupListHead)                                                                                                                 \
     void                                                                                                                                                    \
-IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _int64 nHits, const glType *hits, bool beginsDisjointHitSet)               \
+IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset, _int64 nHits, const glType *hits, const glType *unliftedHits, bool beginsDisjointHitSet)               \
 {                                                                                                                                                           \
     _ASSERT(nLookupsUsed < maxSeeds);                                                                                                                       \
     if (beginsDisjointHitSet) {                                                                                                                             \
@@ -1106,6 +1147,7 @@ IntersectingPairedEndAligner::HashTableHitSet::recordLookup(unsigned seedOffset,
         _ASSERT(currentDisjointHitSet != -1);    /* Essentially that beginsDisjointHitSet is set for the first recordLookup call */                         \
         lookups[nLookupsUsed].currentHitForIntersection = 0;                                                                                                \
         lookups[nLookupsUsed].hits = hits;                                                                                                                  \
+        lookups[nLookupsUsed].unliftedHits = unliftedHits;                                                                                                  \
         lookups[nLookupsUsed].nHits = nHits;                                                                                                                \
         lookups[nLookupsUsed].seedOffset = seedOffset;                                                                                                      \
         lookups[nLookupsUsed].whichDisjointHitSet = currentDisjointHitSet;                                                                                  \
@@ -1181,7 +1223,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
 }
 
 	bool
-IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound)
+        IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound, GenomeLocation *actualUnliftedGenomeLocationFound)
 {
 
     bool anyFound = false;
@@ -1238,6 +1280,10 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(Genom
                 if (probeHit - seedOffset > bestLocationFound) {
 					anyFound = true;
                     mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset;
+                    if (actualUnliftedGenomeLocationFound != NULL) {
+                        *actualUnliftedGenomeLocationFound = doesGenomeIndexHave64BitLocations
+                            ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe];
+                    }
                     *seedOffsetFound = seedOffset;
                 }
 
@@ -1273,7 +1319,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextHitLessThanOrEqualTo(Genom
 
 
     bool
-IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound)
+        IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation)
 {
     bool anyFound = false;
     *genomeLocation = 0;
@@ -1286,6 +1332,9 @@ IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genom
     for (unsigned i = 0; i < nLookupsUsed; i++) {                                                                                           \
         if (lookups[i].nHits > 0 && lookups[i].hits[0] - lookups[i].seedOffset > GenomeLocationAsInt64(*genomeLocation)) {                  \
             mostRecentLocationReturned = *genomeLocation = lookups[i].hits[0] - lookups[i].seedOffset;                                      \
+            if (unliftedGenomeLocation != NULL) {                                                                                           \
+                *unliftedGenomeLocation = lookups[i].unliftedHits[0] - lookups[i].seedOffset;                                               \
+            }                                                                                                                               \
             *seedOffsetFound = lookups[i].seedOffset;                                                                                       \
             anyFound = true;                                                                                                                \
         }                                                                                                                                   \
@@ -1303,7 +1352,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getFirstHit(GenomeLocation *genom
 }
 
     bool
-IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound)
+        IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation)
 {
     //
     // Look through all of the lookups and find the one with the highest location smaller than the current one.
@@ -1373,7 +1422,7 @@ IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *g
 }
 
             bool
-IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore,
+IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, bool newPairIsNonAlt,
                         double *oldMatchProbability)
 {
     if (locationForReadWithMoreHits == InvalidGenomeLocation || !doesRangeMatch(newMoreHitLocation, newFewerHitLocation)) {
@@ -1390,7 +1439,8 @@ IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitL
         //
         // Within merge distance.  Keep the better score (or if they're tied the better match probability).
         //
-        if (newPairScore < pairScore || newPairScore == pairScore && newMatchProbability > matchProbability) {
+        if (newPairScore < pairScore || (newPairScore == pairScore &&
+            (newMatchProbability > matchProbability || (newMatchProbability == matchProbability && newPairIsNonAlt)))) {
 #ifdef _DEBUG
             if (_DumpAlignments) {
                 printf("Merge replacement at anchor (%u, %u), loc (%u, %u), old match prob %e, new match prob %e, old pair score %d, new pair score %d\n",
diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h
index 9bc6029e..e951f64b 100644
--- a/SNAPLib/IntersectingPairedEndAligner.h
+++ b/SNAPLib/IntersectingPairedEndAligner.h
@@ -134,6 +134,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner
     unsigned        maxSpacing;
     unsigned        seedLen;
     bool            doesGenomeIndexHave64BitLocations;
+    bool            doesGenomeIndexHaveAlts;
     _int64          nLocationsScored;
     bool            noUkkonen;
     bool            noOrderedEvaluation;
@@ -149,6 +150,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner
         unsigned        seedOffset;
         _int64          nHits;
         const GL  *     hits;
+        const GL  *     unliftedHits;
         unsigned        whichDisjointHitSet;
 
         //
@@ -185,7 +187,8 @@ class IntersectingPairedEndAligner : public PairedEndAligner
         // provide the lookup function a place to write the result.  Since we need one per
         // lookup, it goes here.
         //
-        GL singletonGenomeLocation[2];  // The [2] is because we need to look one before sometimes, and that allows space
+        GL singletonGenomeLocation[4];  // The [4] is because we need to look one before sometimes, and that allows space
+                                        // also to allow space for unlifted locations
     };
     
     //
@@ -211,26 +214,26 @@ class IntersectingPairedEndAligner : public PairedEndAligner
 		// seed for it not to hit, and since the reads are disjoint there can't be a case
 		// where the same difference caused two seeds to miss).
         //
-        void recordLookup(unsigned seedOffset, _int64 nHits, const unsigned *hits, bool beginsDisjointHitSet);
-        void recordLookup(unsigned seedOffset, _int64 nHits, const GenomeLocation *hits, bool beginsDisjointHitSet);
+        void recordLookup(unsigned seedOffset, _int64 nHits, const unsigned *hits, const unsigned *unliftedHits, bool beginsDisjointHitSet);
+        void recordLookup(unsigned seedOffset, _int64 nHits, const GenomeLocation *hits, const GenomeLocation *unliftedHits, bool beginsDisjointHitSet);
 
         //
         // This efficiently works through the set looking for the next hit at or below this address.
         // A HashTableHitSet only allows a single iteration through its address space per call to
         // init().
         //
-        bool    getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound);
+        bool    getNextHitLessThanOrEqualTo(GenomeLocation maxGenomeLocationToFind, GenomeLocation *actualGenomeLocationFound, unsigned *seedOffsetFound, GenomeLocation *actualUnliftedGenomeLocationFound);
 
         //
         // Walk down just one step, don't binary search.
         //
-        bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound);
+        bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation);
 
 
         //
         // Find the highest genome address.
         //
-        bool    getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound);
+        bool    getFirstHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation);
 
 		unsigned computeBestPossibleScoreForCurrentHit();
 
@@ -377,7 +380,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner
         //
         // Returns true and sets oldMatchProbability if this should be eliminated due to a match.
         //
-        bool checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, 
+        bool checkMerge(GenomeLocation newMoreHitLocation, GenomeLocation newFewerHitLocation, double newMatchProbability, int newPairScore, bool newPairIsNonAlt,
                         double *oldMatchProbability); 
     };
 
@@ -394,14 +397,16 @@ class IntersectingPairedEndAligner : public PairedEndAligner
         //
         double                  matchProbability;
         GenomeLocation          readWithMoreHitsGenomeLocation;
+        GenomeLocation          readWithMoreHitsUnliftedGenomeLocation;
         unsigned                bestPossibleScore;
         unsigned                score;
         unsigned                scoreLimit;             // The scoreLimit with which score was computed
         unsigned                seedOffset;
         int                     genomeOffset;
 
-        void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_) {
+        void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_, GenomeLocation readWithMoreHitsUnliftedGenomeLocation_) {
             readWithMoreHitsGenomeLocation = readWithMoreHitsGenomeLocation_;
+            readWithMoreHitsUnliftedGenomeLocation = readWithMoreHitsUnliftedGenomeLocation_;
             bestPossibleScore = bestPossibleScore_;
             seedOffset = seedOffset_;
             score = -2;
@@ -416,15 +421,17 @@ class IntersectingPairedEndAligner : public PairedEndAligner
         MergeAnchor *           mergeAnchor;
         unsigned                scoringMateCandidateIndex;  // Index into the array of scoring mate candidates where we should look 
         GenomeLocation          readWithFewerHitsGenomeLocation;
+        GenomeLocation          readWithFewerHitsUnliftedGenomeLocation;
         unsigned                whichSetPair;
         unsigned                seedOffset;
 
         unsigned                bestPossibleScore;
 
         void init(GenomeLocation readWithFewerHitsGenomeLocation_, unsigned whichSetPair_, unsigned scoringMateCandidateIndex_, unsigned seedOffset_,
-                  unsigned bestPossibleScore_, ScoringCandidate *scoreListNext_)
+                  unsigned bestPossibleScore_, ScoringCandidate *scoreListNext_, GenomeLocation readWithFewerHitsUnliftedGenomeLocation_)
         {
             readWithFewerHitsGenomeLocation = readWithFewerHitsGenomeLocation_;
+            readWithFewerHitsUnliftedGenomeLocation = readWithFewerHitsUnliftedGenomeLocation_;
             whichSetPair = whichSetPair_;
             _ASSERT(whichSetPair < NUM_SET_PAIRS);  // You wouldn't think this would be necessary, but...
             scoringMateCandidateIndex = scoringMateCandidateIndex_;
@@ -435,6 +442,18 @@ class IntersectingPairedEndAligner : public PairedEndAligner
          }
     };
 
+    static bool isNonAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate)
+    {   // True when, for both ends of the pair, the stored location equals the unlifted location.
+        const bool fewerHitsEndSame = candidate->readWithFewerHitsGenomeLocation == candidate->readWithFewerHitsUnliftedGenomeLocation;
+        return fewerHitsEndSame && mate->readWithMoreHitsGenomeLocation == mate->readWithMoreHitsUnliftedGenomeLocation;
+    }
+
+    static bool isBothAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate)
+    {   // True when, for both ends of the pair, the stored location differs from the unlifted location.
+        const bool fewerHitsEndDiffers = candidate->readWithFewerHitsGenomeLocation != candidate->readWithFewerHitsUnliftedGenomeLocation;
+        return fewerHitsEndDiffers && mate->readWithMoreHitsGenomeLocation != mate->readWithMoreHitsUnliftedGenomeLocation;
+    }
+
     //
     // A pool of scoring candidates.  For each alignment call, we free them all by resetting lowestFreeScoringCandidatePoolEntry to 0,
     // and then fill in the content when they're initialized.  This means that for alignments with few candidates we'll be using the same

From 6fbf18a49e82fbea4d6d8cc46f42fbdd79095b32 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Sun, 6 Dec 2015 21:25:09 -0800
Subject: [PATCH 02/19] Read genome with alts

---
 SNAPLib/FASTA.cpp                        |  37 +--
 SNAPLib/FASTA.h                          |   2 +-
 SNAPLib/Genome.cpp                       | 307 ++++++++++++++++++++++-
 SNAPLib/Genome.h                         |  51 +++-
 SNAPLib/GenomeIndex.cpp                  | 137 +++++++++-
 SNAPLib/GenomeIndex.h                    |  15 +-
 SNAPLib/IntersectingPairedEndAligner.cpp |  30 +--
 SNAPLib/IntersectingPairedEndAligner.h   |  14 +-
 8 files changed, 514 insertions(+), 79 deletions(-)

diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp
index 25418eef..75d92a08 100644
--- a/SNAPLib/FASTA.cpp
+++ b/SNAPLib/FASTA.cpp
@@ -36,7 +36,8 @@ ReadFASTAGenome(
     const char *fileName,
     const char *pieceNameTerminatorCharacters,
     bool spaceIsAPieceNameTerminator,
-    unsigned chromosomePaddingSize)
+    unsigned chromosomePaddingSize,
+    AltContigMap* altMap)
 {
     //
     // We need to know a bound on the size of the genome before we create the Genome object.
@@ -96,33 +97,12 @@ ReadFASTAGenome(
             //
             // Now supply the chromosome name.
             //
-            if (NULL != pieceNameTerminatorCharacters) {
-                for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
-                    char *terminator = strchr(lineBuffer+1, pieceNameTerminatorCharacters[i]);
-                    if (NULL != terminator) {
-                        *terminator = '\0';
-                    }
-                }
-            }
-            if (spaceIsAPieceNameTerminator) {
-                char *terminator = strchr(lineBuffer, ' ');
-                if (NULL != terminator) {
-                    *terminator = '\0';
-                }
-                terminator = strchr(lineBuffer, '\t');
-                if (NULL != terminator) {
-                    *terminator = '\0';
-                }
-            }
-            char *terminator = strchr(lineBuffer, '\n');
-            if (NULL != terminator) {
-                *terminator = '\0';
+            char * terminator = Genome::findTerminator(lineBuffer, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator);
+            if (altMap != NULL) {
+                altMap->addFastaContig(lineBuffer, terminator);
             }
-            terminator = strchr(lineBuffer, '\r');
-            if (NULL != terminator) {
-                *terminator = '\0';
-            }
-            genome->startContig(lineBuffer+1);
+            *terminator = 0;
+            genome->startContig(lineBuffer+1, altMap);
         } else {
             if (!inAContig) {
                 WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n");
@@ -170,6 +150,9 @@ ReadFASTAGenome(
     //
     genome->addData(paddingBuffer);
     genome->fillInContigLengths();
+    if (altMap != NULL) {
+        genome->adjustAltContigs(altMap);
+    }
     genome->sortContigsByName();
 
     fclose(fastaFile);
diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h
index 44cb810f..cda9a1f8 100644
--- a/SNAPLib/FASTA.h
+++ b/SNAPLib/FASTA.h
@@ -27,7 +27,7 @@ Revision History:
 #include "Genome.h"
 
     const Genome *
-ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize);
+ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, AltContigMap* altMap);
 
 //
 // The FASTA appending functions return whether the write was successful.
diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 4a0629ec..07460392 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -31,6 +31,7 @@ Revision History:
 #include "exit.h"
 #include "Error.h"
 #include "Util.h"
+#include "VariableSizeVector.h"
 
 Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs)
 : maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs),
@@ -73,7 +74,7 @@ Genome::addData(const char *data)
 }
 
     void
-Genome::startContig(const char *contigName)
+Genome::startContig(const char *contigName, AltContigMap *altMap)
 {
     if (nContigs == maxContigs) {
         //
@@ -102,6 +103,10 @@ Genome::startContig(const char *contigName)
     strncpy(contigs[nContigs].name,contigName,len);
     contigs[nContigs].name[len-1] = '\0';
 
+    if (altMap != NULL) {
+        altMap->setAltContig(&contigs[nContigs]);
+    }
+
     nContigs++;
 }
 
@@ -465,6 +470,78 @@ void Genome::fillInContigLengths()
     contigs[nContigs-1].length = nBases - GenomeLocationAsInt64(contigs[nContigs-1].beginningLocation);
 }
 
+void Genome::adjustAltContigs(AltContigMap* altMap)
+{
+    // For each alternate contig, look up its parent through altMap and record the parent's
+    // location as the alt's liftedLocation; every failed lookup is reported before soft_exit(1).
+    // Afterwards util::toComplement is run over the bases of reverse-strand alts.
+    if (NULL == altMap) {
+        return;
+    }
+    int nFailures = 0;
+    for (int which = 0; which < nContigs; which++) {
+        Contig &alt = contigs[which];
+        if (!alt.isAlternate) {
+            continue;
+        }
+        GenomeLocation parentLocation;
+        int parentIndex;
+        const char* parentName = altMap->getParentContigName(alt.name);
+        if (NULL == parentName) {
+            WriteErrorMessage("Unable to find parent contig for alt contig %s\n", alt.name);
+            nFailures++;
+        } else if (!getLocationOfContig(parentName, &parentLocation, &parentIndex)) {
+            WriteErrorMessage("Unable to find parent contig %s for alt contig %s\n", parentName, alt.name);
+            nFailures++;
+        } else if (contigs[parentIndex].isAlternate) {
+            WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", alt.name, parentName);
+            nFailures++;
+        } else {
+            alt.liftedLocation = parentLocation;
+        }
+    }
+    if (nFailures != 0) {
+        soft_exit(1);
+    }
+    // reverse-strand alternates
+    for (int which = 0; which < nContigs; which++) {
+        if (contigs[which].isAlternate && contigs[which].isReverseStrand) {
+            util::toComplement(bases + contigs[which].beginningLocation.location, NULL, (int) contigs[which].length);
+        }
+    }
+}
+
+char * Genome::findTerminator(char* lineBuffer, const char* pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator)
+{
+    //
+    // Returns a pointer to the earliest character in lineBuffer that ends the contig name: one of
+    // pieceNameTerminatorCharacters (if non-NULL; searched from the second character on), a space
+    // or tab when spaceIsAPieceNameTerminator is set, or a newline / carriage return.  If there is
+    // none, the result is the string's terminating NUL.  lineBuffer itself is not modified.
+    //
+    char* result = lineBuffer + strlen(lineBuffer);
+    if (NULL != pieceNameTerminatorCharacters) {
+        for (const char *c = pieceNameTerminatorCharacters; *c != '\0'; c++) {
+            char *terminator = strchr(lineBuffer + 1, *c);
+            if (NULL != terminator && terminator < result) {
+                result = terminator;
+            }
+        }
+    }
+    //
+    // Line endings always end the name, spaces and tabs only on request.  None of them may push
+    // the result past a terminator found earlier in the line: the newline checks used to assign
+    // unconditionally, which threw away any piece-name or whitespace terminator before them.
+    //
+    const char *otherTerminators = spaceIsAPieceNameTerminator ? " \t\n\r" : "\n\r";
+    for (const char *c = otherTerminators; *c != '\0'; c++) {
+        char *terminator = strchr(lineBuffer, *c);
+        if (NULL != terminator && terminator < result) {
+            result = terminator;
+        }
+    }
+    return result;
+}
 const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const 
 {
     const Contig *contig = getContigAtLocation(location);
@@ -491,4 +568,230 @@ const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned
     return contig;
 }
 
-GenomeLocation InvalidGenomeLocation;   // Gets set on genome build/load
\ No newline at end of file
+GenomeLocation InvalidGenomeLocation;   // Gets set on genome build/load
+
+// Ends the field starting at 'start' by overwriting its delimiter (a tab, or a line ending of
+// one or two characters) with NULs; returns the start of whatever follows the delimiter.
+char* tokenizeToNextTabOrNewline(char* start, bool* endOfLine, bool* endOfFile)
+{
+    char* cursor = start;
+    while (*cursor != '\0' && *cursor != '\t' && *cursor != '\r' && *cursor != '\n') {
+        cursor++;
+    }
+    if (*cursor == '\0') {
+        // the buffer ended before any delimiter: nothing to overwrite, nothing follows
+        *endOfLine = true;
+        *endOfFile = true;
+        return cursor;
+    }
+    const bool isTab = *cursor == '\t';
+    if (!isTab) {
+        const char next = cursor[1];
+        if (next != *cursor && (next == '\r' || next == '\n')) {
+            // "\r\n" or "\n\r": clear the first character and move on to the second
+            *cursor++ = '\0';
+        }
+    }
+    *cursor = '\0';
+    *endOfLine = !isTab;
+    *endOfFile = false;
+    return cursor + 1;
+}
+
+static const int ALT_SCAF_ACC   = 0; // column types of the alt table, in column-spec order; this one fills AltContig::accession
+static const int PARENT_ACC     = 1; // fills AltContig::parentAccession
+static const int ORI            = 2; // orientation: a field starting with '-' sets AltContig::isRC
+static const int ALT_SCAF_START = 3; // fills AltContig::start
+static const int ALT_SCAF_STOP  = 4; // fills AltContig::stop
+static const int PARENT_START   = 5; // fills AltContig::parentStart
+static const int PARENT_STOP    = 6; // fills AltContig::parentStop; last column the spec is required to name
+static const int ALT_START_TAIL = 7; // fills AltContig::startTail (optional in the spec)
+static const int ALT_STOP_TAIL  = 8; // fills AltContig::stopTail (optional in the spec)
+static const int N_COLUMNS      = 9; // number of column types; also used to tag file columns that are ignored
+
+AltContigMap* AltContigMap::readFromFile(const char* filename, const char* columns)
+{
+    //
+    // Builds an AltContigMap from the tab-separated alt table in 'filename'.  'columns' is a
+    // comma-separated list: first the FASTA accession tag, then the header names of the table's
+    // columns in the order ALT_SCAF_ACC..ALT_STOP_TAIL (the two tail columns may be left out).
+    // Every failure is reported with WriteErrorMessage followed by soft_exit(1).
+    //
+    // The file and the column spec are copied into one buffer that is not freed here:
+    // accessionFastaTag and the accession strings of each record point into it.
+    //
+    FileMapper map;
+    if (!map.init(filename)) {
+        WriteErrorMessage("Failed to map file %s\n", filename);
+        soft_exit(1);
+    }
+    _int64 size = map.getFileSize();
+    char* buffer = (char*)BigAlloc(size + 1 + strlen(columns) + 1);
+    void* token;
+    char* mapped = map.createMapping(0, size, &token);
+    if (mapped == NULL) {
+        WriteErrorMessage("Failed to map file %s\n", filename);
+        soft_exit(1);
+    }
+    memcpy(buffer, mapped, size);
+    buffer[size] = '\0';
+    if (strlen(buffer) != size) {
+        WriteErrorMessage("Nulls in file %s\n", filename);
+        soft_exit(1);
+    }
+    map.unmap(token);
+    strcpy(buffer + size + 1, columns);
+
+    AltContigMap* result = new AltContigMap();
+    // the column spec starts with the accession FASTA tag
+    char* p = buffer + size + 1;
+    if (*p == '#') {
+        p++;
+    }
+    char* q = strchr(p, ',');
+    if (q == NULL) {
+        WriteErrorMessage("Invalid columns spec %s\n", columns);
+        soft_exit(1);
+    }
+    *q = '\0';
+    result->accessionFastaTag = p;
+
+    // get names for each column type (last 2 are optional)
+    p = q + 1;
+    char* columnNames[N_COLUMNS];
+    memset(columnNames, 0, sizeof(char*) * N_COLUMNS);
+    for (int i = 0; i < N_COLUMNS; i++) {
+        columnNames[i] = p;
+        q = strchr(p, ',');
+        if (q == NULL) {
+            // this was the last name in the spec
+            if (i < PARENT_STOP) {
+                WriteErrorMessage("Invalid columns spec %s\n", columns);
+                soft_exit(1);
+            }
+            break;
+        }
+        *q = '\0';
+        p = q + 1;
+    }
+
+    //
+    // Map each column of the file's header line to a column type.  The error paths are written
+    // out instead of sharing goto labels: the old err_file_format label sat inside this loop and
+    // was jumped to from after it, i.e. past the initialization of the loop variable.
+    //
+    VariableSizeVector<int> columnTypes;
+    p = buffer + (*buffer == '#'); // beginning of buffer, skipping possible comment char
+    bool endOfLine = false, endOfFile = false;
+    bool columnFound[N_COLUMNS];
+    memset(columnFound, 0, sizeof(bool)* N_COLUMNS);
+    while (!endOfLine) {
+        // q is the start of the next header name.  It must not be written through: doing so
+        // (as the old "*q = '\0'" did) wipes out that name before it is compared.
+        q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile);
+        int columnType = N_COLUMNS; // names that match no column type mark columns to ignore
+        for (int i = 0; i < N_COLUMNS; i++) {
+            // optional columns that the spec left out have a NULL name
+            if (columnNames[i] != NULL && !strcmp(columnNames[i], p)) {
+                columnType = i;
+                columnFound[i] = true;
+                break;
+            }
+        }
+        columnTypes.add(columnType);
+        p = q;
+    }
+    for (int i = 0; i < N_COLUMNS; i++) {
+        if (columnNames[i] != NULL && !columnFound[i]) {
+            WriteErrorMessage("Invalid file format for alt data in %s\n", filename);
+            soft_exit(1);
+        }
+    }
+
+    // Every remaining line is one record; its fields are assigned according to the column
+    // types collected from the header.
+    const int nHeaderColumns = (int) columnTypes.size();
+    while (!endOfFile) {
+        endOfLine = false;
+        AltContig alt;
+        for (int columnIndex = 0; !endOfLine; columnIndex++) {
+            q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile);
+            // fields beyond the header's column count are ignored rather than indexed
+            switch (columnIndex < nHeaderColumns ? columnTypes[columnIndex] : N_COLUMNS) {
+            case ALT_SCAF_ACC:   alt.accession = p;         break;
+            case PARENT_ACC:     alt.parentAccession = p;   break;
+            case ORI:            alt.isRC = *p == '-';      break;
+            case ALT_SCAF_START: alt.start = atol(p);       break;
+            case ALT_SCAF_STOP:  alt.stop = atol(p);        break;
+            case PARENT_START:   alt.parentStart = atol(p); break;
+            case PARENT_STOP:    alt.parentStop = atol(p);  break;
+            case ALT_START_TAIL: alt.startTail = atol(p);   break;
+            case ALT_STOP_TAIL:  alt.stopTail = atol(p);    break;
+            case N_COLUMNS:
+                // ignore
+                break;
+            default:
+                _ASSERT(false);
+            }
+            p = q;
+        }
+        // Blank lines (such as the empty remainder after a final newline) carry no accession.
+        // They are skipped; a NULL accession must never be turned into a std::string map key.
+        if (alt.accession != NULL && *alt.accession != '\0') {
+            result->altsByAccession[alt.accession] = alt;
+        }
+    }
+    return result;
+}
+
+void AltContigMap::addFastaContig(const char* lineBuffer, const char* nameTerminator)
+{
+    //
+    // Records the accession code found on one FASTA header line under the contig's name.  The
+    // name runs from the character after lineBuffer[0] up to (not including) nameTerminator.
+    //
+    const std::string name(lineBuffer + 1, nameTerminator - lineBuffer - 1);
+
+    // the accession code follows the first occurrence of the tag and ends at '|', whitespace or NUL
+    const char* tag = strstr(lineBuffer, accessionFastaTag);
+    const char* p = tag != NULL ? tag + strlen(accessionFastaTag) : NULL;
+    const char* q = p;
+    while (q != NULL && *q != '\0' && *q != '|' && *q != ' ' && *q != '\t' && *q != '\r' && *q != '\n') {
+        q++;
+    }
+    if (q == p) {
+        // no tag, or no accession characters after it
+        WriteErrorMessage("Unable to find accession code for contig %s in FASTA line\n%s\n", name.c_str(), lineBuffer);
+        soft_exit(1);
+    }
+    // std::string copies exactly [p, q).  The malloc'ed copies this replaces were never freed, and
+    // the accession one was a byte too small for its terminator and left its last character unset.
+    nameToAccession[name] = std::string(p, q - p);
+}
+
+void AltContigMap::setAltContig(Genome::Contig* contig)
+{
+    //
+    // A contig is an alternate when its name has a recorded accession and that accession has
+    // an entry in altsByAccession; the entry's isRC then becomes the contig's isReverseStrand.
+    // In every other case both flags are cleared.
+    //
+    StringMap::iterator byName = nameToAccession.find(contig->name);
+    StringAltContigMap::iterator entry =
+        byName == nameToAccession.end() ? altsByAccession.end() : altsByAccession.find(byName->second);
+    const bool isKnownAlt = entry != altsByAccession.end();
+    contig->isAlternate = isKnownAlt;
+    contig->isReverseStrand = isKnownAlt && entry->second.isRC;
+}
+
+const char* AltContigMap::getParentContigName(const char* altName)
+{
+    // Parent's FASTA name (pointer owned by nameToAccession) or NULL; matched by accession because AltContig::name is never assigned.
+    StringMap::iterator accession = nameToAccession.find(altName);
+    StringAltContigMap::iterator alt = accession == nameToAccession.end() ? altsByAccession.end() : altsByAccession.find(accession->second);
+    const char* parentAccession = alt == altsByAccession.end() ? NULL : alt->second.parentAccession;
+    for (StringMap::iterator i = nameToAccession.begin(); parentAccession != NULL && i != nameToAccession.end(); i++) {
+        if (i->second == parentAccession) return i->first.c_str(); // contig whose accession is the parent's
+    }
+    return NULL;
+}
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
index d65d9008..f2ee8a67 100644
--- a/SNAPLib/Genome.h
+++ b/SNAPLib/Genome.h
@@ -26,6 +26,8 @@ Revision History:
 #include "Compat.h"
 #include "GenericFile.h"
 #include "GenericFile_map.h"
+#include <string>
+#include <map>
 
 //
 // We have two different classes to represent a place in a genome and a distance between places in a genome.
@@ -156,6 +158,8 @@ typedef _int64 GenomeDistance;
 
 extern GenomeLocation InvalidGenomeLocation;
 
+class AltContigMap;
+
 class Genome {
 public:
         //
@@ -174,7 +178,8 @@ class Genome {
             unsigned                maxContigs = 32);
 
         void startContig(
-            const char          *contigName);
+            const char          *contigName,
+            AltContigMap        *altMap);
 
         void addData(
             const char          *data);
@@ -246,14 +251,14 @@ class Genome {
 
         struct Contig {
             Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL),
-                    isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0) {}
+                    isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation) {}
             GenomeLocation     beginningLocation;
             GenomeDistance     length;
+
             bool               isAlternate;
-            int                altGroup; // each group of overlapping alt regions is given a unique ID
             bool               isReverseStrand; // if reversed alternate strand
             GenomeLocation     liftedLocation; // location of beginning of alt contig mapping to primary
-            GenomeLocation     contextBefore, contextAfter;   // context sequence added from primary (alts near ends have less context)
+            
             unsigned           nameLength;
             char              *name;
         };
@@ -278,8 +283,11 @@ class Genome {
         // These are only public so creators of new genomes (i.e., FASTA) can use them.
         //
         void    fillInContigLengths();
+        void    adjustAltContigs(AltContigMap* altMap);
         void    sortContigsByName();
 
+        static char* findTerminator(char* buffer, const char* terminators, bool whitespaceTerminator);
+
 private:
 
         static const int N_PADDING = 100; // Padding to add on either end of the genome to allow substring reads past it
@@ -317,3 +325,38 @@ inline bool genomeLocationIsWithin(GenomeLocation locationA, GenomeLocation loca
 {
     return DistanceBetweenGenomeLocations(locationA, locationB) <= distance;
 }
+
+class AltContigMap // alt contig records keyed by accession, plus a FASTA contig name -> accession table
+{
+public:
+    AltContigMap() {} // empty maps; accessionFastaTag is left unset here (readFromFile assigns it)
+    // Builds a map from the tab-separated alt table in 'filename'; columnList names its columns.
+    static AltContigMap* readFromFile(const char* filename, const char* columnList);
+    // Records the accession found on a FASTA header line under the contig name ending at 'terminator'.
+    void addFastaContig(const char* lineBuffer, const char* terminator);
+    // Sets contig->isAlternate and contig->isReverseStrand from the record matching contig->name.
+    void setAltContig(Genome::Contig* contig);
+    // Name of the parent contig of the alt contig called altName, or NULL when the lookup fails.
+    const char* getParentContigName(const char* altName);
+
+private:
+    // One row of the alt table; the fields below are filled by readFromFile from the named columns.
+    struct AltContig {
+        const char* name; // initialized to NULL by the constructor
+        const char* accession; // ALT_SCAF_ACC column
+        const char* parentAccession; // PARENT_ACC column
+        bool isRC; // true when the ORI column starts with '-'
+        GenomeLocation start, stop; // ALT_SCAF_START / ALT_SCAF_STOP columns
+        GenomeLocation parentStart, parentStop; // PARENT_START / PARENT_STOP columns
+        GenomeLocation startTail, stopTail; // ALT_START_TAIL / ALT_STOP_TAIL columns
+        AltContig() : name(NULL), accession(NULL), parentAccession(NULL), isRC(false),
+            start(0), stop(0), parentStart(0), parentStop(0), startTail(0), stopTail(0) {}
+    };
+
+    // accessionFastaTag: the text addFastaContig searches for; the accession code follows it
+    const char* accessionFastaTag;
+    typedef std::map<std::string, AltContig> StringAltContigMap;
+    StringAltContigMap altsByAccession; // accession -> alt table row
+    typedef std::map<std::string, std::string> StringMap;
+    StringMap nameToAccession; // FASTA contig name -> accession
+};
\ No newline at end of file
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 72f05bd6..8ca51603 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -46,11 +46,12 @@ static const double DEFAULT_SLACK = 0.3;
 static const unsigned DEFAULT_PADDING = 500;
 static const unsigned DEFAULT_KEY_BYTES = 4;
 static const unsigned DEFAULT_LOCATION_SIZE = 4;
-
+static const char* DEFAULT_ALT_COLUMNS = "gb,alt_scaf_acc,parent_acc,ori,alt_scaf_start,alt_scaf_stop,parent_start,parent_stop,alt_start_tail,alt_stop_tail";
 const char *GenomeIndexFileName = "GenomeIndex";
 const char *OverflowTableFileName = "OverflowTable";
 const char *GenomeIndexHashFileName = "GenomeIndexHash";
 const char *GenomeFileName = "Genome";
+const char *LiftedIndexDirName = "Lifted";
 
 static void usage()
 {
@@ -85,12 +86,16 @@ static void usage()
 		"                   In particular, this will generally use less memory than the index will use once it's built, so if this doesn't work you\n"
 		"                   won't be able to use the index anyway. However, if you've got sufficient memory to begin with, this option will just\n"
 		"                   slow down the index build by doing extra, useless IO.\n"
+        "-altmap file       Tab-separated file of alt contig mapping information\n"
+        "-altcols columns   Comma-separated list of columns describing alt mapping file\n"
+        "                   Default is v38 %s\n"
 			,
             DEFAULT_SEED_SIZE,
             DEFAULT_SLACK,
             DEFAULT_PADDING,
             DEFAULT_KEY_BYTES,
-            DEFAULT_LOCATION_SIZE);
+            DEFAULT_LOCATION_SIZE,
+            DEFAULT_ALT_COLUMNS);
     soft_exit_no_print(1);    // Don't use soft-exit, it's confusing people to get an error message after the usage
 }
 
@@ -121,6 +126,8 @@ GenomeIndex::runIndexer(
 	bool large = false;
     unsigned locationSize = DEFAULT_LOCATION_SIZE;
 	bool smallMemory = false;
+    const char* altMapFilename = NULL;
+    const char* altMapColumns = DEFAULT_ALT_COLUMNS;
 
     for (int n = 2; n < argc; n++) {
         if (strcmp(argv[n], "-s") == 0) {
@@ -172,8 +179,7 @@ GenomeIndex::runIndexer(
 			}
 		} else if (argv[n][0] == '-' && argv[n][1] == 's' && argv[n][2] == 'm') {
 			smallMemory = true;
-		}
-		else if (strcmp(argv[n], "-keysize") == 0) {
+		} else if (strcmp(argv[n], "-keysize") == 0) {
             if (n + 1 < argc) {
                 keySizeInBytes = atoi(argv[n+1]);
                 if (keySizeInBytes < 4 || keySizeInBytes > 8) {
@@ -188,6 +194,20 @@ GenomeIndex::runIndexer(
             pieceNameTerminatorCharacters = argv[n] + 2;
         } else if (!strcmp(argv[n], "-bSpace")) {
             spaceIsAPieceNameTerminator = true;
+        } else if (!strcmp(argv[n], "-altmap")) {
+            if (n + 1 < argc) {
+                altMapFilename = argv[n + 1];
+                n++;
+            } else {
+                usage();
+            }
+        } else if (!strcmp(argv[n], "-altcols")) {
+            if (n + 1 < argc) {
+                altMapColumns = argv[n + 1];
+                n++;
+            } else {
+                usage();
+            }
         } else {
             WriteErrorMessage("Invalid argument: %s\n\n", argv[n]);
             usage();
@@ -223,7 +243,10 @@ GenomeIndex::runIndexer(
     BigAllocUseHugePages = false;
 
     _int64 start = timeInMillis();
-    const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding);
+
+    AltContigMap* altMap = altMapFilename != NULL ? AltContigMap::readFromFile(altMapFilename, altMapColumns) : NULL;
+
+    const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, altMap);
     if (NULL == genome) {
         WriteErrorMessage("Unable to read FASTA file\n");
         soft_exit(1);
@@ -261,12 +284,17 @@ SetInvalidGenomeLocation(unsigned locationSize)
     bool
 GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double slack, bool computeBias, const char *directoryName,
                                     unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, unsigned hashTableKeySize, 
-									bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory)
+									bool large, const char *histogramFileName, unsigned locationSize, bool smallMemory, GenomeIndex* unliftedIndex)
 {
 	PreventMachineHibernationWhileThisThreadIsAlive();
 
     SetInvalidGenomeLocation(locationSize);
 
+    if (genome->hasAltContigs() && smallMemory) {
+        WriteErrorMessage("Warning: Cannot use small memory to build index with alt contigs, ignoring flag\n");
+        smallMemory = false;
+    }
+
     bool buildHistogram = (histogramFileName != NULL);
     FILE *histogramFile;
     if (buildHistogram) {
@@ -282,7 +310,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         return false;
     }
 
-    int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), strlen(GenomeFileName)))) + 1);
+    int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), __max(strlen(GenomeFileName), strlen(LiftedIndexDirName))))) + 1);
     char *filenameBuffer = new char[filenameBufferSize];
     
 	fprintf(stderr,"Saving genome...");
@@ -421,6 +449,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 		threadContexts[i].backpointerSpillLock = &backpointerSpillLock;
 		threadContexts[i].lastBackpointerIndexUsedByThread = lastBackpointerIndexUsedByThread;
 		threadContexts[i].backpointerSpillFile = backpointerSpillFile;
+        threadContexts[i].unliftedIndex = unliftedIndex;
 
         StartNewThread(BuildHashTablesWorkerThreadMain, &threadContexts[i]);
     }
@@ -742,15 +771,27 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 
     fclose(indexFile);
  
+    if (genome->hasAltContigs() && unliftedIndex != NULL) {
+        // create a sub-index with only seeds that occur in alt contigs
+        snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
+        bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
+            hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index);
+        if (!ok) {
+            WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer);
+            soft_exit(1);
+            return false;
+        }
+    }
+
     delete index;
     if (computeBias && biasTable != NULL) {
         delete[] biasTable;
     }
- 
+
     WriteStatusMessage("%llds\n", (timeInMillis() + 500 - start) / 1000);
 
     delete[] filenameBuffer;
-    
+
     return true;
 }
 
@@ -845,7 +886,7 @@ SNAPHashTable** GenomeIndex::allocateHashTables(
 
 
 
-GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL)
+GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL), hasAlts(FALSE), liftedIndex(NULL)
 {
 }
 
@@ -885,6 +926,9 @@ GenomeIndex::~GenomeIndex()
 	delete genome;
 	genome = NULL;
 
+    if (NULL != liftedIndex) {
+        delete liftedIndex;
+    }
 }
 
     void
@@ -1171,6 +1215,7 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context)
     const Genome *genome = context->genome;
     unsigned seedLen = context->seedLen;
 	bool large = context->large;
+    bool lift = context->unliftedIndex != NULL;
  
     //
     // Batch the insertions into the hash tables, because otherwise we spend all of
@@ -1202,7 +1247,12 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context)
 
 		Seed seed(bases, seedLen);
 
-        indexSeed(genomeLocation, seed, batches, context, &stats, large);
+        if (!lift) {
+            indexSeed(genomeLocation, seed, batches, context, &stats, large);
+        }
+        else {
+            indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large);
+        }
     } // For each genome base in our area
 
     //
@@ -1224,9 +1274,8 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context)
     }
 
 }
-    
-const _int64 GenomeIndex::printPeriod = 100000000;
 
+const _int64 GenomeIndex::printPeriod = 100000000;
 
 
     void
@@ -1259,6 +1308,48 @@ GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBat
 	} // If we filled a batch
 }
 
+    void
+GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large)
+{
+    // Used when building the lifted sub-index (todo: optimize). Acts only when genomeLocation is the first hit
+    // (or first RC hit, if there are no forward hits) for this seed in the unlifted index: if any hit lifts to a
+    // different location (presumably an alt contig), every hit is re-indexed here at its lifted location.
+
+    _int64 nHits, nRCHits;
+    if (doesGenomeIndexHave64BitLocations()) {
+        const GenomeLocation *hits, *rcHits;
+        GenomeLocation singleHit[2], singleRCHit[2];
+        context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
+#define CHECK_ALTS_AND_ADD_LIFTED \
+        if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) { \
+            bool anyAlts = false; \
+            for (int i = 0; i < nHits && ! anyAlts; i++) { \
+                anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; \
+            } \
+            for (int i = 0; i < nRCHits && !anyAlts; i++) { \
+                anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; \
+            } \
+            if (anyAlts) { \
+                for (int i = 0; i < nHits; i++) { \
+                    indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large); \
+                } \
+                if (!seed.isOwnReverseComplement()) { \
+                    Seed rcSeed = ~seed; \
+                    for (int i = 0; i < nRCHits; i++) { \
+                        indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large); \
+                    } \
+                } \
+            } \
+        }
+        CHECK_ALTS_AND_ADD_LIFTED
+    }
+    else {
+        const unsigned *hits, *rcHits;
+        context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
+        CHECK_ALTS_AND_ADD_LIFTED
+    }
+}
+
         void 
 GenomeIndex::ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement,
                 _int64 *bothComplementsUsed, _int64 *genomeLocationsInOverflowTable, _int64 *seedsWithMultipleOccurrences, bool large)
@@ -1929,6 +2020,16 @@ GenomeIndex::lookupSeedAlt32(
     lookupSeed32(seed, nHits, hits, nRCHits, rcHits);
     *unliftedHits = *hits;
     *unliftedRCHits = *rcHits;
+    if (hasAlts) {
+        _int64 nLiftedHits, nLiftedRCHits;
+        const unsigned *liftedHits, *liftedRCHits;
+        liftedIndex->lookupSeed32(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits);
+        if (nLiftedHits != 0 || nLiftedRCHits != 0) {
+            _ASSERT(nLiftedHits == *nHits && nLiftedRCHits == *nRCHits);
+            *hits = liftedHits;
+            *rcHits = liftedRCHits;
+        }
+    }
 }
 
     void
@@ -2073,6 +2174,16 @@ GenomeIndex::lookupSeedAlt(
     lookupSeed(seed, nHits, hits, nRCHits, rcHits, singleHit, singleRCHit);
     *unliftedHits = *hits;
     *unliftedRCHits = *rcHits;
+    if (hasAlts) {
+        _int64 nLiftedHits, nLiftedRCHits;
+        const GenomeLocation *liftedHits, *liftedRCHits;
+        liftedIndex->lookupSeed(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits, singleHit + 1, singleRCHit + 1);
+        if (nLiftedHits != 0 || nLiftedRCHits != 0) {
+            _ASSERT(nLiftedHits == *nHits && nLiftedRCHits == *nRCHits);
+            *hits = liftedHits;
+            *rcHits = liftedRCHits;
+        }
+    }
 }
 
     void 
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
index 2cdd0782..49eea33b 100644
--- a/SNAPLib/GenomeIndex.h
+++ b/SNAPLib/GenomeIndex.h
@@ -93,6 +93,12 @@ class GenomeIndex {
     unsigned nHashTables;
     const Genome *genome;
 
+    // TRUE if genome has alt contigs
+    bool hasAlts;
+
+    // secondary index for all seeds that map to alt contigs with locations lifted to non-alt contigs
+    GenomeIndex* liftedIndex;
+
     bool largeHashTable;
     unsigned locationSize;
 
@@ -154,12 +160,13 @@ class GenomeIndex {
     // Build a genome index and write it to a directory.  If you don't already have a saved index
     // the only way to get one is to build it into a directory and then load it from the directory.
     // NB: This deletes the Genome that's passed into it.
+    // unliftedIndex is an internal parameter used to build 2-level index for genomes with alt contigs
     //
     static bool BuildIndexToDirectory(const Genome *genome, int seedLen, double slack,
-                                      bool computeBias, const char *directory,
+                                      bool computeBias, const char *directoryName,
                                       unsigned maxThreads, unsigned chromosomePaddingSize, bool forceExact, 
                                       unsigned hashTableKeySize, bool large, const char *histogramFileName,
-                                      unsigned locationSize, bool smallMemory);
+                                      unsigned locationSize, bool smallMemory, GenomeIndex *unliftedIndex = NULL);
 
  
     //
@@ -233,6 +240,9 @@ class GenomeIndex {
 		ExclusiveLock					*backpointerSpillLock;
 		FILE							*backpointerSpillFile;
 
+        // used for building sub-index of only seeds that occur in alt contigs
+        GenomeIndex                     *unliftedIndex;
+
         ExclusiveLock                   *hashTableLocks;
         ExclusiveLock                   *overflowTableLock;
     };
@@ -281,6 +291,7 @@ class GenomeIndex {
     static const _int64 printPeriod;
 
     virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
+    virtual void indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
     virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
 
     static void BuildHashTablesWorkerThreadMain(void *param);
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
index a0d1c759..c0c30b3e 100644
--- a/SNAPLib/IntersectingPairedEndAligner.cpp
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -342,17 +342,14 @@ IntersectingPairedEndAligner::align(
                 if (doesGenomeIndexHave64BitLocations) {
                     index->lookupSeed(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC],
                         hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation());
-                }
-                else {
+                } else {
                     index->lookupSeed32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC]);
                 }
-            }
-            else {
+            } else {
                 if (doesGenomeIndexHave64BitLocations) {
                     index->lookupSeedAlt(seed, &nHits[FORWARD], &hits[FORWARD], &nHits[RC], &hits[RC], &unliftedHits[FORWARD], &unliftedHits[RC],
                         hashTableHitSets[whichRead][FORWARD]->getNextSingletonLocation(), hashTableHitSets[whichRead][RC]->getNextSingletonLocation());
-                }
-                else {
+                } else {
                     index->lookupSeedAlt32(seed, &nHits[FORWARD], &hits32[FORWARD], &nHits[RC], &hits32[RC], &unliftedHits32[FORWARD], &unliftedHits32[RC]);
                 }
             }
@@ -370,16 +367,13 @@ IntersectingPairedEndAligner::align(
                     if (!doesGenomeIndexHaveAlts) {
                         if (doesGenomeIndexHave64BitLocations) {
                             hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], NULL, beginsDisjointHitSet[dir]);
-                        }
-                        else {
+                        } else {
                             hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], NULL, beginsDisjointHitSet[dir]);
                         }
-                    }
-                    else {
+                    } else {
                         if (doesGenomeIndexHave64BitLocations) {
                             hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits[dir], unliftedHits[dir], beginsDisjointHitSet[dir]);
-                        }
-                        else {
+                        } else {
                             hashTableHitSets[whichRead][dir]->recordLookup(offset, nHits[dir], hits32[dir], unliftedHits32[dir], beginsDisjointHitSet[dir]);
                         }
                     }
@@ -687,11 +681,11 @@ IntersectingPairedEndAligner::align(
 
                         // reduce probability of pairs matching across different overlapping alts
                         // todo: assuming if they're on different alts within maxSpacing they overlap - true for GRCh38 but not necessarily for all genomes
-                        // use crossover probability with 1 centiMorgan ~= 1Mbp
-                        if (doesGenomeIndexHaveAlts && isBothAltPairMapping(candidate, mate) &&
-                            abs(mate->readWithMoreHitsUnliftedGenomeLocation - candidate->readWithFewerHitsUnliftedGenomeLocation) > 2*maxSpacing) 
+                        // use crossover probability with 1 centiMorgan ~= 1Mbp - too strict?
+                        if (doesGenomeIndexHaveAlts && candidate->isAlt() && mate->isAlt() &&
+                            DistanceBetweenGenomeLocations(mate->readWithMoreHitsUnliftedGenomeLocation, candidate->readWithFewerHitsUnliftedGenomeLocation) > 2*maxSpacing) 
                         {
-                            pairProbability *= 1e-8 * abs(candidate->readWithFewerHitsGenomeLocation - mate->readWithMoreHitsGenomeLocation);
+                            pairProbability *= 1e-8 * DistanceBetweenGenomeLocations(candidate->readWithFewerHitsGenomeLocation, mate->readWithMoreHitsGenomeLocation);
                         }
 
                         //
@@ -753,7 +747,7 @@ IntersectingPairedEndAligner::align(
                             candidate->mergeAnchor = mergeAnchor;
                         } else {
                             merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset,
-                                pairProbability, pairScore, doesGenomeIndexHaveAlts && isNonAltPairMapping(candidate, mate), &oldPairProbability);
+                                pairProbability, pairScore, doesGenomeIndexHaveAlts && (! candidate->isAlt()) && (!mate->isAlt()), &oldPairProbability);
                         }
 
                         if (!merged) {
@@ -768,7 +762,7 @@ IntersectingPairedEndAligner::align(
 
                             if (pairScore <= maxK && (pairScore < bestPairScore ||
                                 (pairScore == bestPairScore && (pairProbability > probabilityOfBestPair ||
-                                (pairProbability == probabilityOfBestPair && isNonAltPairMapping(candidate, mate)))))) {
+                                (pairProbability == probabilityOfBestPair && (! candidate->isAlt()) && (!mate->isAlt())))))) {
                                 //
                                 // A new best hit.
                                 //
diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h
index e951f64b..403e28e6 100644
--- a/SNAPLib/IntersectingPairedEndAligner.h
+++ b/SNAPLib/IntersectingPairedEndAligner.h
@@ -414,6 +414,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner
             matchProbability = 0;
             genomeOffset = 0;
         }
+        bool isAlt() const { return readWithMoreHitsGenomeLocation != readWithMoreHitsUnliftedGenomeLocation; }
     };
 
     struct ScoringCandidate {
@@ -440,20 +441,9 @@ class IntersectingPairedEndAligner : public PairedEndAligner
             scoreListNext = scoreListNext_;
             mergeAnchor = NULL;
          }
+        bool isAlt() const { return readWithFewerHitsGenomeLocation != readWithFewerHitsUnliftedGenomeLocation; }
     };
 
-    static bool isNonAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate)
-    {
-        return candidate->readWithFewerHitsGenomeLocation == candidate->readWithFewerHitsUnliftedGenomeLocation &&
-            mate->readWithMoreHitsGenomeLocation == mate->readWithMoreHitsUnliftedGenomeLocation;
-    }
-
-    static bool isBothAltPairMapping(ScoringCandidate* candidate, ScoringMateCandidate* mate)
-    {
-        return candidate->readWithFewerHitsGenomeLocation != candidate->readWithFewerHitsUnliftedGenomeLocation &&
-            mate->readWithMoreHitsGenomeLocation != mate->readWithMoreHitsUnliftedGenomeLocation;
-    }
-
     //
     // A pool of scoring candidates.  For each alignment call, we free them all by resetting lowestFreeScoringCandidatePoolEntry to 0,
     // and then fill in the content when they're initialized.  This means that for alignments with few candidates we'll be using the same

From 3046b3081c7bdb1fc9160ef3481b2e2434eb1356 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Mon, 7 Dec 2015 11:07:29 -0800
Subject: [PATCH 03/19] Genome index read/write

---
 SNAPLib/FASTA.cpp       |  31 ++++++++++-
 SNAPLib/Genome.cpp      | 118 +++++++++++++++++++++-------------------
 SNAPLib/Genome.h        |  12 ++--
 SNAPLib/GenomeIndex.cpp |  67 ++++++++++++++++-------
 SNAPLib/GenomeIndex.h   |   8 ++-
 5 files changed, 149 insertions(+), 87 deletions(-)

diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp
index 75d92a08..659bc990 100644
--- a/SNAPLib/FASTA.cpp
+++ b/SNAPLib/FASTA.cpp
@@ -97,11 +97,38 @@ ReadFASTAGenome(
             //
             // Now supply the chromosome name.
             //
-            char * terminator = Genome::findTerminator(lineBuffer, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator);
+            char * terminator = lineBuffer + strlen(lineBuffer);
+            char * p;
+            if (NULL != pieceNameTerminatorCharacters) {
+                for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
+                    p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]);
+                    if (NULL != p && p < terminator) {
+                        terminator = p;
+                    }
+                }
+            }
+            if (spaceIsAPieceNameTerminator) {
+                p = strchr(lineBuffer, ' ');
+                if (NULL != p && p < terminator) {
+                    terminator = p;
+                }
+                p = strchr(lineBuffer, '\t');
+                if (NULL != p && p < terminator) {
+                    terminator = p;
+                }
+            }
+            p = strchr(lineBuffer, '\n');
+            if (NULL != p && p < terminator) {
+                terminator = p;
+            }
+            p = strchr(lineBuffer, '\r');
+            if (NULL != p && p < terminator) {
+                terminator = p;
+            }
             if (altMap != NULL) {
                 altMap->addFastaContig(lineBuffer, terminator);
             }
-            *terminator = 0;
+            *terminator = '\0';
             genome->startContig(lineBuffer+1, altMap);
         } else {
             if (!inAContig) {
diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 07460392..2da7421a 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -35,7 +35,7 @@ Revision History:
 
 Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs)
 : maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs),
-  mappedFile(NULL)
+mappedFile(NULL), minAltLocation(i_maxBases)
 {
     bases = ((char *) BigAlloc(nBasesStored + 2 * N_PADDING)) + N_PADDING;
     if (NULL == bases) {
@@ -154,7 +154,13 @@ Genome::saveToFile(const char *fileName) const
          curChar = contigs[i].name + n;
          if (*curChar == ' '){ *curChar = '_'; }
         }
-        fprintf(saveFile,"%lld %s\n",contigs[i].beginningLocation, contigs[i].name);
+        if (!hasAltContigs()) {
+            // backward compatible for genomes without alts
+            fprintf(saveFile, "%lld %s\n", contigs[i].beginningLocation, contigs[i].name);
+        } else {
+            fprintf(saveFile, "%lld %s %d %d %lld\n", contigs[i].beginningLocation, contigs[i].name,
+                contigs[i].isAlternate ? 1 : 0, contigs[i].isAlternateRC ? 1 : 0, contigs[i].liftedLocation);
+        }
     }
 
 	//
@@ -223,9 +229,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc
 
     int contigNameBufferSize = 0;
     char *contigNameBuffer = NULL;
-    unsigned n;
-    size_t contigSize;
-    char *curName;
+    genome->minAltLocation = nBases;
     for (unsigned i = 0; i < nContigs; i++) {
         if (NULL == reallocatingFgetsGenericFile(&contigNameBuffer, &contigNameBufferSize, loadFile)) {	 
             WriteErrorMessage("Unable to read contig description\n");
@@ -234,29 +238,48 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc
             return NULL;
         }
 
-        for (n = 0; n < (unsigned)contigNameBufferSize; n++) {
-	        if (contigNameBuffer[n] == ' ') {
-	            contigNameBuffer[n] = '\0'; 
-	            break;
-	        }
-	    }
-
+        contigNameBuffer[contigNameBufferSize - 1] = '\0';
         _int64 contigStart;
-        if (1 != sscanf(contigNameBuffer, "%lld", &contigStart)) {
-            WriteErrorMessage("Unable to parse contig start in genome file '%s', '%s%'\n", fileName, contigNameBuffer);
+        const char* SEP = " \n\r";
+        char *token = strtok(contigNameBuffer, SEP);
+        if (token == NULL || 1 != sscanf(token, "%lld", &contigStart)) {
+err_contig_parse:
+            WriteErrorMessage("Unable to parse contigs in genome file '%s', '%s'\n", fileName, contigNameBuffer);
             soft_exit(1);
         }
         genome->contigs[i].beginningLocation = GenomeLocation(contigStart);
-	    contigNameBuffer[n] = ' '; 
-	    n++; // increment n so we start copying at the position after the space
-	    contigSize = strlen(contigNameBuffer + n) - 1; //don't include the final \n
-        genome->contigs[i].name = new char[contigSize + 1];
-        genome->contigs[i].nameLength = (unsigned)contigSize;
-	    curName = genome->contigs[i].name;
-	    for (unsigned pos = 0; pos < contigSize; pos++) {
-	      curName[pos] = contigNameBuffer[pos + n];
-	    }
-        curName[contigSize] = '\0';
+        token = strtok(NULL, SEP);
+        if (token == NULL) goto err_contig_parse;
+        genome->contigs[i].name = new char[strlen(token) + 1];
+        genome->contigs[i].nameLength = (unsigned)strlen(token);
+        strcpy(genome->contigs[i].name, token);
+        token = strtok(NULL, SEP);
+        if (token == NULL) { // no alt fields on this line (format written for genomes without alts)
+            genome->contigs[i].isAlternate = false;
+            genome->contigs[i].isAlternateRC = false;
+            genome->contigs[i].liftedLocation = InvalidGenomeLocation;
+        } else { // alt format: "<isAlternate> <isAlternateRC> <liftedLocation>", one token each
+            int isAlternate;
+            if (1 != sscanf(token, "%d", &isAlternate)) {
+                goto err_contig_parse;
+            }
+            genome->contigs[i].isAlternate = isAlternate != 0;
+            int isAlternateRC;
+            if ((token = strtok(NULL, SEP)) == NULL || 1 != sscanf(token, "%d", &isAlternateRC)) {
+                goto err_contig_parse;
+            }
+            genome->contigs[i].isAlternateRC = isAlternateRC != 0;
+            _int64 liftedLocation;
+            if ((token = strtok(NULL, SEP)) == NULL || 1 != sscanf(token, "%lld", &liftedLocation)) {
+                goto err_contig_parse;
+            }
+            genome->contigs[i].liftedLocation = liftedLocation;
+
+            if (isAlternate && contigStart < genome->minAltLocation.location) {
+                genome->minAltLocation = contigStart;
+            }
+        }
+
     } // for each contig
 
     if (0 != loadFile->advance(GenomeLocationAsInt64(minLocation))) {
@@ -476,9 +499,13 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
         return;
     }
     bool error = false;
-    // build parent links from alt contigs
+    // build parent links from alt contigs, and find minAltLocation
+    minAltLocation = maxBases;
     for (int i = 0; i < nContigs; i++) {
         if (contigs[i].isAlternate) {
+            if (contigs[i].beginningLocation < minAltLocation) {
+                minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2;
+            }
             const char* parentName = altMap->getParentContigName(contigs[i].name);
             if (parentName == NULL) {
                 WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name);
@@ -505,43 +532,24 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
 
     // flip RC contigs
     for (int i = 0; i < nContigs; i++) {
-        if (contigs[i].isAlternate && contigs[i].isReverseStrand) {
+        if (contigs[i].isAlternate && contigs[i].isAlternateRC) {
             util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length);
         }
     }
 }
 
-char * Genome::findTerminator(char* lineBuffer, const char* pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator)
+GenomeLocation Genome::getLiftedLocation(GenomeLocation altLocation) const
 {
-    char* result = lineBuffer + strlen(lineBuffer);
-    if (NULL != pieceNameTerminatorCharacters) {
-        for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
-            char *terminator = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]);
-            if (NULL != terminator && terminator < result) {
-                result = terminator;
-            }
-        }
-    }
-    if (spaceIsAPieceNameTerminator) {
-        char *terminator = strchr(lineBuffer, ' ');
-        if (NULL != terminator && terminator < result) {
-            result = terminator;
-        }
-        terminator = strchr(lineBuffer, '\t');
-        if (NULL != terminator && terminator < result) {
-            result = terminator;
-        }
-    }
-    char *terminator = strchr(lineBuffer, '\n');
-    if (NULL != terminator) {
-        result = terminator;
+    if (altLocation < minAltLocation) { // below the lowest alt contig: nothing to lift
+        return altLocation;
     }
-    terminator = strchr(lineBuffer, '\r');
-    if (NULL != terminator) {
-        result = terminator;
+    const Contig* alt = getContigAtLocation(altLocation);
+    if (alt == NULL || !alt->isAlternate) { // not inside an alt contig: location lifts to itself
+        return altLocation;
     }
-    return result;
+    return alt->liftedLocation + (altLocation - alt->beginningLocation); // todo: padding??
 }
+
 const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const 
 {
     const Contig *contig = getContigAtLocation(location);
@@ -776,12 +784,12 @@ void AltContigMap::setAltContig(Genome::Contig* contig)
         StringAltContigMap::iterator alt = altsByAccession.find(accession->second);
         if (alt != altsByAccession.end()) {
             contig->isAlternate = true;
-            contig->isReverseStrand = alt->second.isRC;
+            contig->isAlternateRC = alt->second.isRC;
             return;
         }
     }
     contig->isAlternate = false;
-    contig->isReverseStrand = false;
+    contig->isAlternateRC = false;
 }
 
 const char* AltContigMap::getParentContigName(const char* altName)
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
index f2ee8a67..4b6d926c 100644
--- a/SNAPLib/Genome.h
+++ b/SNAPLib/Genome.h
@@ -251,12 +251,12 @@ class Genome {
 
         struct Contig {
             Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL),
-                    isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation) {}
+            isAlternate(false), isAlternateRC(false), liftedLocation(InvalidGenomeLocation) {}
             GenomeLocation     beginningLocation;
             GenomeDistance     length;
 
             bool               isAlternate;
-            bool               isReverseStrand; // if reversed alternate strand
+            bool               isAlternateRC; // if reversed alternate strand
             GenomeLocation     liftedLocation; // location of beginning of alt contig mapping to primary
             
             unsigned           nameLength;
@@ -272,9 +272,9 @@ class Genome {
         const Contig *getNextContigAfterLocation(GenomeLocation location) const;
         int getContigNumAtLocation(GenomeLocation location) const;    // Returns the contig number, which runs from 0 .. getNumContigs() - 1.
 
-        inline bool hasAltContigs() const { return FALSE;  } // todo: implement
+        inline bool hasAltContigs() const { return minAltLocation < maxBases; }
 
-        GenomeLocation getLiftedLocation(GenomeLocation altLocation) const { return altLocation;  } // todo: implement
+        GenomeLocation getLiftedLocation(GenomeLocation altLocation) const;
 
 // unused        Genome *copy() const {return copy(true,true,true);}
 // unused        Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);}
@@ -286,8 +286,6 @@ class Genome {
         void    adjustAltContigs(AltContigMap* altMap);
         void    sortContigsByName();
 
-        static char* findTerminator(char* buffer, const char* terminators, bool whitespaceTerminator);
-
 private:
 
         static const int N_PADDING = 100; // Padding to add on either end of the genome to allow substring reads past it
@@ -301,6 +299,8 @@ class Genome {
         GenomeLocation       minLocation;
         GenomeLocation       maxLocation;
 
+        GenomeLocation       minAltLocation;
+
         //
         // A genome is made up of a bunch of contigs, typically chromosomes.  Contigs have names,
         // which are stored here.
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 8ca51603..717cb58c 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -487,8 +487,12 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     // We're done with the raw genome.  Delete it to save some memory.
     //
   
-    delete genome;
-    genome = NULL;
+    bool genomeHasAlts = genome->hasAltContigs();
+    if (! (genomeHasAlts && unliftedIndex == NULL)) {
+        // delete if we won't need it later
+        delete genome;
+        genome = NULL;
+    }
 
 	char *halfBuiltHashTableSpillFileName = NULL;
 
@@ -766,16 +770,21 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         return false;
     }
 
-    fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d", GenomeIndexFormatMajorVersion, GenomeIndexFormatMinorVersion, index->nHashTables, 
+    fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d",
+        // NOTE: this must be changed if the format no longer supports v5 (pre-alt)
+        genomeHasAlts ? GenomeIndexFormatMajorVersion : GenomeIndexFormatMajorVersionWithoutAlts,
+        GenomeIndexFormatMinorVersion, index->nHashTables, 
         index->overflowTableSize, seedLen, chromosomePaddingSize, hashTableKeySize, totalBytesWritten, large ? 0 : 1, locationSize); 
 
     fclose(indexFile);
  
-    if (genome->hasAltContigs() && unliftedIndex != NULL) {
+    if (genomeHasAlts && unliftedIndex == NULL) {
         // create a sub-index with only seeds that occur in alt contigs
         snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
         bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
             hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index);
+        delete genome;
+        genome = NULL;
         if (!ok) {
             WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer);
             soft_exit(1);
@@ -1697,7 +1706,7 @@ GenomeIndex::printBiasTables()
 }
 
         GenomeIndex *
-GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch)
+GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, bool liftedIndex)
 {
     int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), strlen(GenomeFileName)))) + 1);
     char *filenameBuffer = new char[filenameBufferSize];
@@ -1740,7 +1749,7 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch)
     indexFile->close();
     delete indexFile;
 
-    if (majorVersion != GenomeIndexFormatMajorVersion) {
+    if (majorVersion != GenomeIndexFormatMajorVersion && majorVersion != GenomeIndexFormatMajorVersionWithoutAlts) {
         WriteErrorMessage("This genome index appears to be from a different version of SNAP than this, and so we can't read it.  Index version %d, SNAP index format version %d\n",
             majorVersion, GenomeIndexFormatMajorVersion);
         soft_exit(1);
@@ -1920,22 +1929,38 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch)
 		blobFile = NULL;
 	}
 
-    snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName);
-    if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) {
-        WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n");
-        delete[] filenameBuffer;
-        delete index;
-        return NULL;
-    }
+    if (!liftedIndex) {
+        snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName);
+        if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) {
+            WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n");
+            delete[] filenameBuffer;
+            delete index;
+            return NULL;
+        }
 
-    if ((_int64)index->genome->getCountOfBases() + (_int64)index->overflowTableSize > 0xfffffff0 && locationSize == 4) {
-        WriteErrorMessage("\nThis index has too many overflow entries to be valid.  Some early versions of SNAP\n"
-                        "allowed building indices with too small of a seed size, and this appears to be such\n"
-                        "an index.  You can no longer build indices like this, and you also can't use them\n"
-                        "because they are corrupt and would produce incorrect results.  Please use an index\n"
-                        "built with a larger seed size.  For hg19, the seed size must be at least 19.\n"
-                        "For other reference genomes this quantity will vary.\n");
-        soft_exit(1);
+        if ((_int64)index->genome->getCountOfBases() + (_int64)index->overflowTableSize > 0xfffffff0 && locationSize == 4) {
+            WriteErrorMessage("\nThis index has too many overflow entries to be valid.  Some early versions of SNAP\n"
+                "allowed building indices with too small of a seed size, and this appears to be such\n"
+                "an index.  You can no longer build indices like this, and you also can't use them\n"
+                "because they are corrupt and would produce incorrect results.  Please use an index\n"
+                "built with a larger seed size.  For hg19, the seed size must be at least 19.\n"
+                "For other reference genomes this quantity will vary.\n");
+            soft_exit(1);
+        }
+
+        if (index->genome->hasAltContigs()) {
+            snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
+            index->liftedIndex = loadFromDirectory(filenameBuffer, map, prefetch, true);
+            if (index->liftedIndex == NULL) {
+                WriteErrorMessage("Missing alt index directory %s\n", filenameBuffer);
+                soft_exit(1);
+            }
+            index->liftedIndex->genome = index->genome;
+        } else {
+            index->liftedIndex = NULL;
+        }
+    } else {
+        index->genome = NULL;
     }
 
     delete[] filenameBuffer;
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
index 49eea33b..ca45201e 100644
--- a/SNAPLib/GenomeIndex.h
+++ b/SNAPLib/GenomeIndex.h
@@ -82,7 +82,7 @@ class GenomeIndex {
     //
     static void runIndexer(int argc, const char **argv);
 
-    static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch);
+    static GenomeIndex *loadFromDirectory(char *directoryName, bool map, bool prefetch, bool liftedIndex = false);
 
     static void printBiasTables();
 
@@ -175,9 +175,11 @@ class GenomeIndex {
     static SNAPHashTable** allocateHashTables(unsigned* o_nTables, GenomeDistance countOfBases, double slack,
         int seedLen, unsigned hashTableKeySize, bool large, unsigned locationSize, double* biasTable = NULL);
     
-    static const unsigned GenomeIndexFormatMajorVersion = 5;
+    static const unsigned GenomeIndexFormatMajorVersion = 6;
     static const unsigned GenomeIndexFormatMinorVersion = 0;
-    
+    // NOTE: this must be changed if the format no longer supports v5 (pre-alt)
+    static const unsigned GenomeIndexFormatMajorVersionWithoutAlts = 5;
+
     static const unsigned largestBiasTable = 32;    // Can't be bigger than the biggest seed size, which is set in Seed.h.  Bigger than 32 means a new Seed structure.
     static const unsigned largestKeySize = 8;
     static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1];

From a4b492deb8d3ef0e9e9a49842313001452e37f3c Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Mon, 7 Dec 2015 20:55:49 -0800
Subject: [PATCH 04/19] Alt test data generator

---
 tests/alttestgen.py | 125 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 tests/alttestgen.py

diff --git a/tests/alttestgen.py b/tests/alttestgen.py
new file mode 100644
index 00000000..16769edc
--- /dev/null
+++ b/tests/alttestgen.py
@@ -0,0 +1,125 @@
+﻿# alttestgen.py
+#
+# generate data files for alt-contig test
+#
+# test is alttest.py
+#
+
+import sys
+import os
+import shutil
+import subprocess
+import random
+
+import pandas as pd
+
+BASES = "ACTG"
+RCBASES = {"A":"T", "T":"A", "C":"G", "G":"C"}
+
+def random_bases(n):
+    """Return a string of n bases, each drawn independently from BASES."""
+    # One random.choice call per base, in order, so seeded runs produce the same string.
+    picks = [random.choice(BASES) for _ in range(n)]
+    return "".join(picks)
+
+def random_mutate(seq, p = 0.02):
+    out = list(seq)  # every edit replaces exactly one character, so positions never shift
+    for pos in (i for i in range(len(out)) if random.random() <= p):  # lazy: one draw per position
+        k = "ACTG".find(out[pos])
+        out[pos] = random.choice(BASES[:k] + BASES[k+1:])
+    return "".join(out)
+
+def rc(seq):
+    """Return the reverse complement of seq; a character missing from RCBASES raises KeyError."""
+    # Complement in forward order, then reverse the result.
+    complemented = [RCBASES[base] for base in seq]
+    return "".join(reversed(complemented))
+
+class Read:
+    """A read: id, contig name (chr), position, bases and an optional quality string."""
+    def __init__(self, id, chr, pos, seq, qual=None):
+        self.id, self.chr, self.pos = id, chr, pos
+        self.seq, self.qual = seq, qual
+
+    def __str__(self):
+        return "Read({}, {}, {}, {})".format(self.id, self.chr, self.pos, self.seq)
+
+    def to_sam_pair(self, other):
+        """Return two SAM lines (flags 99 and 147, MAPQ 60) for self and its mate other."""
+        # NOTE(review): both records carry the same unsigned TLEN value - confirm this is intended.
+        tlen = abs(self.pos - other.pos + len(other.seq))
+        fmt = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n"
+        def record(read, flag, mate):
+            return fmt.format(read.id, flag, read.chr, read.pos, 60, len(read.seq),
+                              mate.chr, mate.pos, tlen, read.seq, 'A'*len(read.seq))
+        return record(self, 99, other) + record(other, 147, self)
+
+class Contig:
+    """A named sequence; alt contigs also record their parent contig, offset and orientation."""
+    def __init__(self, name, accession, seq, isAlt=False, parent=None, parentLoc = 0, isAltRC=False):
+        self.name, self.accession, self.seq = name, accession, seq
+        self.isAlt, self.isAltRC = isAlt, isAltRC
+        # parent is the parent contig's name; parentLoc is the 0-based start offset within it
+        self.parent, self.parentLoc = parent, parentLoc
+
+    def __str__(self):
+        kind = 'alt' if self.isAlt else 'ref'
+        strand = 'rc' if self.isAltRC else ''
+        fields = (self.name, self.accession, self.seq, kind,
+                  self.parent, self.parentLoc, strand)
+        return "Contig({}, {}, {}, {}, {}, {}, {})".format(*fields)
+
+class Genome:
+    """A set of Contig objects keyed by name, with helpers to derive alts, reads and test files."""
+    def __init__(self, contigs=None):
+        # A literal {} default would be a single dict shared by every Genome() built without arguments.
+        self.contigs = {} if contigs is None else contigs
+
+    def add(self, contig):
+        self.contigs[contig.name] = contig
+
+    def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.1):
+        """Add an alt contig: a mutated copy of parent's seq[start:stop], reverse-complemented if isRC."""
+        altseq = random_mutate(self.contigs[parent].seq[start:stop], pmut)
+        self.add(Contig(name, accession, rc(altseq) if isRC else altseq, True, parent, start, isRC))
+
+    def get_seq(self, chr, start, end):
+        return self.contigs[chr].seq[start:end]
+
+    def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None):
+        """Return a Read of up to len mutated bases from chr at pos; isRC only affects the generated id."""
+        if id is None:
+            id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos, ('r' if isRC else 'f'))
+        return Read(id, chr, pos, random_mutate(self.get_seq(chr, pos, pos + len), pmut))
+
+    def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02):  # -> [read at pos1, read at pos2]
+        id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1, chr2, pos2)  # shared by both mates
+        return [self.make_read(chr1, pos1, False, len, pmut, id + "/1"),
+                self.make_read(chr2, pos2, True, len, pmut, id + "/2")]
+
+    def write_fasta(self, filename):  # one '>name|gb|accession' header per contig, 80 bases per line
+        with open(filename, 'w') as file:
+            for contig in self.contigs.values():
+                file.write(">{}|gb|{}\n".format(contig.name, contig.accession))
+                for i in range(0, len(contig.seq), 80):
+                    file.write("{}\n".format(contig.seq[i:i+80]))
+
+    def write_alts(self, filename):  # tab-separated table: header line, then one row per alt contig
+        with open(filename, 'w') as file:
+            file.write("#alt_scaf_acc\tparent_acc\tori\talt_scaf_start\talt_scaf_stop\tparent_start\tparent_stop\talt_start_tail\talt_stop_tail\n")
+            for contig in self.contigs.values():  # start/stop columns are written 1-based, inclusive
+                if contig.isAlt:
+                    file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
+                        contig.accession, self.contigs[contig.parent].accession, '-' if contig.isAltRC else '+',
+                        1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0))
+
+g = Genome()
+g.add(Contig("chr1", "C01", random_bases(2000)))  # 2000 random bases as the only reference contig
+g.add_alt("chr1a", "C01A", "chr1", 500, 1500)  # alt contig derived from chr1[500:1500]
+g.write_fasta("test.fa")
+g.write_alts("test_alts.txt")
+# Write read pairs: mate 1 from chr1 at 500+i, mate 2 from chr1a at i, for i = 0, 10, ..., 100.
+with open("test.sam", "w") as file:
+    for i in range(0, 101, 10):
+        [r1, r2] = g.make_pair('chr1', 500 + i, 'chr1a', i)
+        file.write(r1.to_sam_pair(r2))

From 7e09023244279c0e6ec39427cb5ae5efcbe91339 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Tue, 8 Dec 2015 16:18:23 -0800
Subject: [PATCH 05/19] Simple test runs

---
 SNAPLib/BaseAligner.cpp                  | 42 +++-------
 SNAPLib/BaseAligner.h                    | 22 ++++++
 SNAPLib/Genome.cpp                       | 98 ++++++++++++++----------
 SNAPLib/Genome.h                         |  3 +-
 SNAPLib/GenomeIndex.cpp                  | 67 +++++++++++-----
 SNAPLib/IntersectingPairedEndAligner.cpp | 21 ++++-
 tests/alttestgen.py                      | 34 ++++----
 7 files changed, 179 insertions(+), 108 deletions(-)

diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp
index 02efa6c0..39c3489f 100644
--- a/SNAPLib/BaseAligner.cpp
+++ b/SNAPLib/BaseAligner.cpp
@@ -44,27 +44,6 @@ using std::min;
 #define TRACE(...) {}
 #endif
 
-
-typedef struct MatchInfo
-{
-    GenomeLocation  location;
-    GenomeLocation  liftedLocation;
-    double          matchProbability;
-
-    MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p) :
-        location(_loc), liftedLocation(_lifted), matchProbability(_p) {}
-} MatchInfo;
-
-bool
-matchInfoComparator(
-    const MatchInfo& a,
-    const MatchInfo& b)
-{
-    return a.liftedLocation < b.liftedLocation;
-}
-
-typedef VariableSizeVector<MatchInfo> MatchInfoVector;
-
 BaseAligner::BaseAligner(
     GenomeIndex    *i_genomeIndex,
     unsigned        i_maxHitsToConsider,
@@ -85,7 +64,7 @@ BaseAligner::BaseAligner(
         genomeIndex(i_genomeIndex), maxHitsToConsider(i_maxHitsToConsider), maxK(i_maxK),
         maxReadSize(i_maxReadSize), maxSeedsToUseFromCommandLine(i_maxSeedsToUseFromCommandLine),
         maxSeedCoverage(i_maxSeedCoverage), readId(-1), extraSearchDepth(i_extraSearchDepth),
-        explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats), 
+        explorePopularSeeds(false), stopOnFirstHit(false), stats(i_stats), allMatches(NULL),
         noUkkonen(i_noUkkonen), noOrderedEvaluation(i_noOrderedEvaluation), noTruncation(i_noTruncation),
 		minWeightToCheck(max(1u, i_minWeightToCheck)), maxSecondaryAlignmentsPerContig(i_maxSecondaryAlignmentsPerContig)
 /*++
@@ -247,6 +226,9 @@ Routine Description:
     }
     hashTableEpoch = 0;
 
+    if (genome->hasAltContigs()) {
+        allMatches = new MatchInfoVector();
+    }
  
 }
 
@@ -677,8 +659,7 @@ Return Value:
     * Add up the highest-probability matches of all overlapping alternates
     */
     double
-computeLiftedCandidateProbability(
-    MatchInfoVector* allMatches,
+BaseAligner::computeLiftedCandidateProbability(
     GenomeDistance length)
 {
     std::sort(allMatches->begin(), allMatches->end(), matchInfoComparator);
@@ -796,10 +777,9 @@ Return Value:
 #endif
 
     unsigned weightListToCheck = highestUsedWeightList;
-    MatchInfoVector* allMatches = NULL;
-    bool anyAltMatches = FALSE;
-    if (genome->hasAltContigs()) {
-        allMatches = new MatchInfoVector();
+    bool anyAltMatches = false;
+    if (allMatches != NULL) {
+        allMatches->clear();
     }
 
     do {
@@ -822,7 +802,7 @@ Return Value:
                 if (bestScore <= maxK) {
                     primaryResult->location = bestScoreGenomeLocation;
                     if (anyAltMatches) {
-                        probabilityOfAllCandidates = computeLiftedCandidateProbability(allMatches, read[0]->getDataLength());
+                        probabilityOfAllCandidates = computeLiftedCandidateProbability(read[0]->getDataLength());
                     }
                     primaryResult->mapq = computeMAPQ(probabilityOfAllCandidates, probabilityOfBestCandidate, bestScore, popularSeedsSkipped);
                     if (primaryResult->mapq >= MAPQ_LIMIT_FOR_SINGLE_HIT) {
@@ -976,8 +956,8 @@ Return Value:
 
                             // remember in case there are alt matches
                             if (allMatches != NULL) {
-                                if ((! anyAltMatches) && genome->getContigAtLocation(genomeLocation)->isAlternate) {
-                                    anyAltMatches = TRUE;
+                                if ((! anyAltMatches) && genome->getLiftedLocation(genomeLocation) != genomeLocation) {
+                                    anyAltMatches = true;
                                 }
                                 allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability));
                             }
diff --git a/SNAPLib/BaseAligner.h b/SNAPLib/BaseAligner.h
index 61a24cba..ee369a7c 100644
--- a/SNAPLib/BaseAligner.h
+++ b/SNAPLib/BaseAligner.h
@@ -34,6 +34,7 @@ Revision History:
 #include "AlignerStats.h"
 #include "directions.h"
 #include "GenomeIndex.h"
+#include "VariableSizeVector.h"
 
 extern bool doAlignerPrefetch;
 
@@ -326,6 +327,27 @@ class BaseAligner {
 
     AlignerStats *stats;
 
+    typedef struct MatchInfo                // One scored candidate, remembered in case there are alt matches.
+    {
+        GenomeLocation  location;           // Genome location at which the candidate was scored.
+        GenomeLocation  liftedLocation;     // genome->getLiftedLocation(location), as supplied by the caller.
+        double          matchProbability;   // Match probability computed for this candidate.
+
+        MatchInfo(GenomeLocation _loc, GenomeLocation _lifted, double _p) :
+            location(_loc), liftedLocation(_lifted), matchProbability(_p) {}
+    } MatchInfo;
+    // Orders candidates by lifted location; passed to std::sort in computeLiftedCandidateProbability.
+    static bool matchInfoComparator(const BaseAligner::MatchInfo& a, const BaseAligner::MatchInfo& b)
+    {
+        return a.liftedLocation < b.liftedLocation;
+    }
+
+    typedef VariableSizeVector<MatchInfo> MatchInfoVector;
+    // Initialized to NULL; allocated only when genome->hasAltContigs(), and cleared before candidates are scored.
+    MatchInfoVector* allMatches;
+    // Sorts allMatches, then (per its header comment) adds up the best matches of overlapping alternates.
+    double computeLiftedCandidateProbability(GenomeDistance length);
+
     unsigned *hitCountByExtraSearchDepth;   // How many hits at each depth bigger than the current best edit distance.
                                             // So if the current best hit has edit distance 2, then hitCountByExtraSearchDepth[0] would
                                             // be the count of hits at edit distance 2, while hitCountByExtraSearchDepth[2] would be the count
diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 2da7421a..06c350e1 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -265,18 +265,20 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc
             }
             genome->contigs[i].isAlternate = isAlternate != 0;
             int isAlternateRC;
+            token = strtok(NULL, SEP);
             if (token == NULL || 1 != sscanf(token, "%d", &isAlternateRC)) {
                 goto err_contig_parse;
             }
             genome->contigs[i].isAlternateRC = isAlternateRC != 0;
             _int64 liftedLocation;
+            token = strtok(NULL, SEP);
             if (token == NULL || 1 != sscanf(token, "%lld", &liftedLocation)) {
                 goto err_contig_parse;
             }
             genome->contigs[i].liftedLocation = liftedLocation;
 
             if (isAlternate && contigStart < genome->minAltLocation.location) {
-                genome->minAltLocation = contigStart;
+                genome->minAltLocation = contigStart - chromosomePadding / 2;
             }
         }
 
@@ -309,6 +311,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc
 	}
 	
 	genome->fillInContigLengths();
+    genome->adjustAltContigs(NULL);
     genome->sortContigsByName();
     delete[] contigNameBuffer;
     return genome;
@@ -495,39 +498,38 @@ void Genome::fillInContigLengths()
 
 void Genome::adjustAltContigs(AltContigMap* altMap)
 {
-    if (altMap == NULL) {
-        return;
-    }
-    bool error = false;
-    // build parent links from alt contigs, and find minAltLocation
-    minAltLocation = maxBases;
-    for (int i = 0; i < nContigs; i++) {
-        if (contigs[i].isAlternate) {
-            if (contigs[i].beginningLocation < minAltLocation) {
-                minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2;
-            }
-            const char* parentName = altMap->getParentContigName(contigs[i].name);
-            if (parentName == NULL) {
-                WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name);
-                error = true;
-                continue;
-            }
-            GenomeLocation parentLocation;
-            int parentIndex;
-            if (!getLocationOfContig(parentName, &parentLocation, &parentIndex)) {
-                WriteErrorMessage("Unable to find parent contig %s for alt contig %s\n", parentName, contigs[i].name);
-                error = true;
-                continue;
-            }
-            if (contigs[parentIndex].isAlternate) {
-                WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName);
-                error = true; continue;
+    if (altMap != NULL) {
+        bool error = false;
+        // build parent links from alt contigs, and find minAltLocation
+        minAltLocation = maxBases;
+        for (int i = 0; i < nContigs; i++) {
+            if (contigs[i].isAlternate) {
+                if (contigs[i].beginningLocation < minAltLocation) {
+                    minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2;
+                }
+                const char* parentName = altMap->getParentContigName(contigs[i].name);
+                if (parentName == NULL) {
+                    WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name);
+                    error = true;
+                    continue;
+                }
+                GenomeLocation parentLocation;
+                int parentIndex;
+                if (!getLocationOfContig(parentName, &parentLocation, &parentIndex)) {
+                    WriteErrorMessage("Unable to find parent contig %s for alt contig %s\n", parentName, contigs[i].name);
+                    error = true;
+                    continue;
+                }
+                if (contigs[parentIndex].isAlternate) {
+                    WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName);
+                    error = true; continue;
+                }
+                contigs[i].liftedLocation = parentLocation;
             }
-            contigs[i].liftedLocation = parentLocation;
         }
-    }
-    if (error) {
-        soft_exit(1);
+        if (error) {
+            soft_exit(1);
+        }
     }
 
     // flip RC contigs
@@ -540,14 +542,14 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
 
 GenomeLocation Genome::getLiftedLocation(GenomeLocation altLocation) const
 {
-    if (minAltLocation < minAltLocation) {
+    if (altLocation < minAltLocation) { // below the lowest alt contig: nothing to lift
         return altLocation;
     }
     const Contig* alt = getContigAtLocation(altLocation);
-    if (alt == NULL) {
+    if (alt == NULL || ! alt->isAlternate) { // no contig, or a non-alt contig: location is returned unchanged
         return altLocation;
     }
-    return alt->liftedLocation + (altLocation - alt->beginningLocation); // todo: padding??
+    return alt->liftedLocation + (altLocation - alt->beginningLocation); // same offset, rebased onto liftedLocation
 }
 
 const Genome::Contig *Genome::getContigForRead(GenomeLocation location, unsigned readLength, GenomeDistance *extraBasesClippedBefore) const 
@@ -655,7 +657,10 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
         soft_exit(1);
     }
     *q = '\0';
-    result->accessionFastaTag = p;
+    char * tag = (char*) malloc(q - p + 2);
+    strcpy(tag, p);
+    strcat(tag, "|");
+    result->accessionFastaTag = tag;
 
     // get names for each column type (last 2 are optional)
     p = q + 1;
@@ -689,7 +694,6 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
             WriteErrorMessage("Invalid file format for alt data in %s\n", filename);
             soft_exit(1);
         }
-        *q = '\0';
         for (int i = 0; i <= N_COLUMNS; i++) {
             if (i < N_COLUMNS && !strcmp(columnNames[i], p)) {
                 columnTypes.add(i);
@@ -711,6 +715,9 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
         AltContig alt;
         for (int columnIndex = 0; !endOfLine; columnIndex++) {
             q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile);
+            if (endOfFile) {
+                break;
+            }
             switch (columnTypes[columnIndex]) {
             case ALT_SCAF_ACC:
                 alt.accession = p;
@@ -747,7 +754,9 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
             }
             p = q;
         }
-        result->altsByAccession[alt.accession] = alt;
+        if (!endOfFile) {
+            result->altsByAccession[alt.accession] = alt;
+        }
     }
     return result;
 }
@@ -771,10 +780,16 @@ void AltContigMap::addFastaContig(const char* lineBuffer, const char* nameTermin
         q++;
     }
     char* accession = (char*)malloc(q - p);
-    memcpy(accession, p, q - p - 1);
+    memcpy(accession, p, q - p);
     *(accession + (q - p)) = '\0';
 
     nameToAccession[name] = accession;
+    accessionToName[accession] = name;
+
+    StringAltContigMap::iterator alt = altsByAccession.find(accession);
+    if (alt != altsByAccession.end()) {
+        alt->second.name = name;
+    }
 }
 
 void AltContigMap::setAltContig(Genome::Contig* contig)
@@ -798,7 +813,10 @@ const char* AltContigMap::getParentContigName(const char* altName)
     if (accession != nameToAccession.end()) {
         StringAltContigMap::iterator alt = altsByAccession.find(accession->second);
         if (alt != altsByAccession.end()) {
-            return alt->second.name;
+            StringMap::iterator parent = accessionToName.find(alt->second.parentAccession);
+            if (parent != accessionToName.end()) {
+                return parent->second.data();
+            }
         }
     }
     return NULL;
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
index 4b6d926c..38c8d087 100644
--- a/SNAPLib/Genome.h
+++ b/SNAPLib/Genome.h
@@ -359,4 +359,5 @@ class AltContigMap
     StringAltContigMap altsByAccession;
     typedef std::map<std::string, std::string> StringMap;
     StringMap nameToAccession;
-};
\ No newline at end of file
+    StringMap accessionToName;
+};
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 717cb58c..fa110167 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -313,19 +313,22 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     int filenameBufferSize = (int)(strlen(directoryName) + 1 + __max(strlen(GenomeIndexFileName), __max(strlen(OverflowTableFileName), __max(strlen(GenomeIndexHashFileName), __max(strlen(GenomeFileName), strlen(LiftedIndexDirName))))) + 1);
     char *filenameBuffer = new char[filenameBufferSize];
     
-	fprintf(stderr,"Saving genome...");
-	_int64 start = timeInMillis();
-    snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName);
-    if (!genome->saveToFile(filenameBuffer)) {
-        WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n");
-        delete[] filenameBuffer;
-        return false;
+    _int64 start;
+    if (unliftedIndex == NULL) {
+        fprintf(stderr, "Saving genome...");
+        start = timeInMillis();
+        snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName);
+        if (!genome->saveToFile(filenameBuffer)) {
+            WriteErrorMessage("GenomeIndex::saveToDirectory: Failed to save the genome itself\n");
+            delete[] filenameBuffer;
+            return false;
+        }
+        fprintf(stderr, "%llds\n", (timeInMillis() + 500 - start) / 1000);
     }
-	fprintf(stderr,"%llds\n", (timeInMillis() + 500 - start) / 1000);
 
 	GenomeIndex *index = new GenomeIndex();
     index->genome = NULL;   // We always delete the index when we're done, but we delete the genome first to save space during the overflow table build.
-
+    
     GenomeDistance countOfBases = genome->getCountOfBases();
     if (locationSize != 8 && countOfBases > ((_int64) 1 << (locationSize*8)) - 16) {
         WriteErrorMessage("Genome is too big for %d byte genome locations.  Specify a larger location size with -locationSize\n", locationSize);
@@ -419,6 +422,12 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 		}
 	}
 
+    index->seedLen = seedLen;
+    index->hashTableKeySize = hashTableKeySize;
+    index->largeHashTable = large;
+    index->locationSize = locationSize;
+    index->genome = genome;
+
     for (unsigned i = 0; i < nThreads; i++) {
 		threadContexts[i].whichThread = i;
 		threadContexts[i].nThreads = nThreads;
@@ -697,8 +706,10 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         }
         totalBytesWritten += bytesWrittenThisHashTable;
 
-		delete hashTables[whichHashTable];
-		hashTables[whichHashTable] = NULL;
+        if (!(genomeHasAlts && unliftedIndex == NULL)) {
+            delete hashTables[whichHashTable];
+            hashTables[whichHashTable] = NULL;
+        }
 	} // for each hash table
 
     fclose(tablesFile);
@@ -783,8 +794,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
         bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
             hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index);
-        delete genome;
-        genome = NULL;
         if (!ok) {
             WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer);
             soft_exit(1);
@@ -792,6 +801,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         }
     }
 
+    index->genome = NULL; // deleted earlier
     delete index;
     if (computeBias && biasTable != NULL) {
         delete[] biasTable;
@@ -895,7 +905,7 @@ SNAPHashTable** GenomeIndex::allocateHashTables(
 
 
 
-GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL), hasAlts(FALSE), liftedIndex(NULL)
+GenomeIndex::GenomeIndex() : nHashTables(0), hashTables(NULL), overflowTable32(NULL), overflowTable64(NULL), genome(NULL), tablesBlob(NULL), mappedOverflowTable(NULL), mappedTables(NULL), hasAlts(false), liftedIndex(NULL)
 {
 }
 
@@ -1258,8 +1268,7 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context)
 
         if (!lift) {
             indexSeed(genomeLocation, seed, batches, context, &stats, large);
-        }
-        else {
+        } else {
             indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large);
         }
     } // For each genome base in our area
@@ -1355,7 +1364,27 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa
     else {
         const unsigned *hits, *rcHits;
         context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
-        CHECK_ALTS_AND_ADD_LIFTED
+        // CHECK_ALTS_AND_ADD_LIFTED
+        if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) {
+            bool anyAlts = false;
+            for (int i = 0; i < nHits && !anyAlts; i++) {
+                anyAlts = genome->getLiftedLocation(hits[i]) != hits[i];
+            }
+            for (int i = 0; i < nRCHits && !anyAlts; i++) {
+                anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i];
+            }
+            if (anyAlts) {
+                for (int i = 0; i < nHits; i++) {
+                    indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large);
+                }
+                if (!seed.isOwnReverseComplement()) {
+                    Seed rcSeed = ~seed;
+                    for (int i = 0; i < nRCHits; i++) {
+                        indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large);
+                    }
+                }
+            }
+        }
     }
 }
 
@@ -1929,7 +1958,7 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, boo
 		blobFile = NULL;
 	}
 
-    if (!liftedIndex) {
+    if (liftedIndex == NULL) {
         snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName);
         if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) {
             WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n");
@@ -1956,10 +1985,12 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, boo
                 soft_exit(1);
             }
             index->liftedIndex->genome = index->genome;
+            index->hasAlts = true;
         } else {
             index->liftedIndex = NULL;
         }
     } else {
+        index->hasAlts = true;
         index->genome = NULL;
     }
 
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
index c0c30b3e..a9ca597d 100644
--- a/SNAPLib/IntersectingPairedEndAligner.cpp
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -56,6 +56,7 @@ IntersectingPairedEndAligner::IntersectingPairedEndAligner(
     maxSecondaryAlignmentsPerContig(maxSecondaryAlignmentsPerContig_)
 {
     doesGenomeIndexHave64BitLocations = index->doesGenomeIndexHave64BitLocations();
+    doesGenomeIndexHaveAlts = index->getGenome()->hasAltContigs();
 
     unsigned maxSeedsToUse;
     if (0 != numSeedsFromCommandLine) {
@@ -670,7 +671,7 @@ IntersectingPairedEndAligner::align(
                         }
 #endif // _DEBUG
 
-                        _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore);
+                        // !! FIX THIS BEFORE CHECKIN !! _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore);
 
                         mate->scoreLimit = scoreLimit - fewerEndScore;
                     }
@@ -1275,8 +1276,8 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
 					anyFound = true;
                     mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset;
                     if (actualUnliftedGenomeLocationFound != NULL) {
-                        *actualUnliftedGenomeLocationFound = doesGenomeIndexHave64BitLocations
-                            ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe];
+                        *actualUnliftedGenomeLocationFound = (doesGenomeIndexHave64BitLocations
+                            ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]) - seedOffset;
                     }
                     *seedOffsetFound = seedOffset;
                 }
@@ -1353,6 +1354,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
     //
     GenomeLocation foundLocation = 0;
     bool anyFound = false;
+    const bool setUnlifted = unliftedGenomeLocation != NULL;
 
     //
     // Run through the lookups pushing up any that are at the most recently returned
@@ -1362,6 +1364,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
         _int64 *currentHitForIntersection;
         _int64 nHits;
         GenomeLocation hitLocation;
+        GenomeLocation unliftedHitLocation;
         unsigned seedOffset;
 
         //
@@ -1373,6 +1376,9 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
         seedOffset = lookups[i].seedOffset;                                                                             \
         if (nHits != *currentHitForIntersection) {                                                                      \
             hitLocation = lookups[i].hits[*currentHitForIntersection];                                                  \
+            if (setUnlifted) {                                                                                          \
+                unliftedHitLocation = lookups[i].unliftedHits[*currentHitForIntersection];                              \
+            }                                                                                                           \
         }
 
 
@@ -1392,8 +1398,14 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
             }
             if (doesGenomeIndexHave64BitLocations) {
                 hitLocation = lookups64[i].hits[*currentHitForIntersection];
+                if (setUnlifted) {
+                    unliftedHitLocation = lookups64[i].unliftedHits[*currentHitForIntersection];
+                }
             } else {
                 hitLocation = lookups32[i].hits[*currentHitForIntersection];
+                if (setUnlifted) {
+                    unliftedHitLocation = lookups32[i].unliftedHits[*currentHitForIntersection];
+                }
             }
         }
 
@@ -1402,6 +1414,9 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
                 hitLocation >= seedOffset) // found location isn't too small to push us before the beginning of the genome
             {
                 *genomeLocation = foundLocation = hitLocation - seedOffset;
+                if (setUnlifted) {
+                    *unliftedGenomeLocation = unliftedHitLocation - seedOffset;
+                }
                 *seedOffsetFound = seedOffset;
                 anyFound = true;
             }
diff --git a/tests/alttestgen.py b/tests/alttestgen.py
index 16769edc..8129565b 100644
--- a/tests/alttestgen.py
+++ b/tests/alttestgen.py
@@ -48,11 +48,11 @@ def __str__(self):
 
     def to_sam_pair(self, other):
         r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format(
-            self.id, 99, self.chr, self.pos, 60, len(self.seq), other.chr,
-            other.pos, abs(self.pos - other.pos + len(other.seq)), self.seq, 'A'*len(self.seq))
+            self.id, 99, self.chr, self.pos + 1, 60, len(self.seq), other.chr,
+            other.pos + 1, abs(self.pos - other.pos + len(other.seq)), self.seq, 'A'*len(self.seq))
         return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format(
-            other.id, 147, other.chr, other.pos, 60, len(other.seq), self.chr,
-            self.pos, abs(self.pos - other.pos + len(other.seq)), other.seq, 'A'*len(other.seq))
+            other.id, 147, other.chr, other.pos + 1, 60, len(other.seq), self.chr,
+            self.pos + 1, abs(self.pos - other.pos + len(other.seq)), other.seq, 'A'*len(other.seq))
 
 class Contig:
     def __init__(self, name, accession, seq, isAlt=False, parent=None, parentLoc = 0, isAltRC=False):
@@ -76,7 +76,7 @@ def __init__(self, contigs={}):
     def add(self, contig):
         self.contigs[contig.name] = contig
 
-    def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.1):
+    def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05):
         pc = self.contigs[parent]
         altseq = random_mutate(pc.seq[start:stop], pmut)
         if (isRC):
@@ -88,21 +88,23 @@ def get_seq(self, chr, start, end):
 
     def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None):
         if id == None:
-            id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos, ('r' if isRC else 'f'))
+            id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isRC else 'f'))
         return Read(id, chr, pos, random_mutate(self.get_seq(chr, pos, pos + len), pmut))
 
     def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02):
-        id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1, chr2, pos2)
+        id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1+1, chr2, pos2+1)
         r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1")
         r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2")
         return [r1, r2]
 
     def write_fasta(self, filename):
         with open(filename, 'w') as file:
-            for contig in self.contigs.values():
-                file.write(">{}|gb|{}\n".format(contig.name, contig.accession))
-                for i in range(0, len(contig.seq), 80):
-                    file.write("{}\n".format(contig.seq[i:i+80]))
+            for write_alts in [False, True]:
+                for contig in self.contigs.values():
+                    if contig.isAlt == write_alts:
+                        file.write(">{}|gb|{}\n".format(contig.name, contig.accession))
+                        for i in range(0, len(contig.seq), 80):
+                            file.write("{}\n".format(contig.seq[i:i+80]))
 
     def write_alts(self, filename):
         with open(filename, 'w') as file:
@@ -114,12 +116,14 @@ def write_alts(self, filename):
                         1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0))
 
 g = Genome()
-g.add(Contig("chr1", "C01", random_bases(2000)))
-g.add_alt("chr1a", "C01A", "chr1", 500, 1500)
+g.add(Contig("chr1", "C01", random_bases(3000)))
+g.add_alt("chr1a", "C01A", "chr1", 1000, 2000)
 g.write_fasta("test.fa")
 g.write_alts("test_alts.txt")
 
 with open("test.sam", "w") as file:
-    for i in range(0, 101, 10):
-        [r1, r2] = g.make_pair('chr1', 500 + i, 'chr1a', i)
+    for i in range(100, 201, 20):
+        [r1, r2] = g.make_pair('chr1', i, 'chr1a', i)
+        file.write(r1.to_sam_pair(r2))
+        [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000)
         file.write(r1.to_sam_pair(r2))

From cb7a567758f069eded69e2134ff65617216a55d7 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Wed, 9 Dec 2015 14:12:17 -0800
Subject: [PATCH 06/19] Handle minus strand alt contigs

---
 SNAPLib/Genome.cpp                       |  3 +-
 SNAPLib/IntersectingPairedEndAligner.cpp |  2 +-
 SNAPLib/SAM.cpp                          | 54 ++++++++++++++++++++----
 tests/alttestgen.py                      | 21 ++++++---
 4 files changed, 64 insertions(+), 16 deletions(-)

diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 06c350e1..42511534 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -311,7 +311,6 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc
 	}
 	
 	genome->fillInContigLengths();
-    genome->adjustAltContigs(NULL);
     genome->sortContigsByName();
     delete[] contigNameBuffer;
     return genome;
@@ -535,7 +534,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
     // flip RC contigs
     for (int i = 0; i < nContigs; i++) {
         if (contigs[i].isAlternate && contigs[i].isAlternateRC) {
-            util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length);
+            util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length - chromosomePadding);
         }
     }
 }
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
index a9ca597d..45d2713b 100644
--- a/SNAPLib/IntersectingPairedEndAligner.cpp
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -630,7 +630,7 @@ IntersectingPairedEndAligner::align(
         scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsUnliftedGenomeLocation,
             candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset);
 
-        _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore);
+        // todo: fix _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore);
 
 #ifdef _DEBUG
         if (_DumpAlignments) {
diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp
index 19f32fcb..dfc7fb5e 100644
--- a/SNAPLib/SAM.cpp
+++ b/SNAPLib/SAM.cpp
@@ -1027,7 +1027,6 @@ SAMFormat::createSAMLine(
 {
     contigName = "*";
     positionInContig = 0;
-    const char *cigar = "*";
     templateLength = 0;
 
     if (secondaryAlignment) {
@@ -1092,6 +1091,7 @@ SAMFormat::createSAMLine(
         contigIndex = (int)(contig - genome->getContigs());
         positionInContig = genomeLocation - contig->beginningLocation + 1; // SAM is 1-based
         mapQuality = max(0, min(70, mapQuality));       // FIXME: manifest constant.
+
     } else {
         flags |= SAM_UNMAPPED;
         mapQuality = 0;
@@ -1228,13 +1228,17 @@ SAMFormat::writeRead(
     }
 
 	if (genomeLocation != InvalidGenomeLocation) {
-		cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize,
-			clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter, 
-			read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM,
-			&editDistance, o_addFrontClipping);
-		if (*o_addFrontClipping != 0) {
-			return false;
-		}
+        if (!context.genome->getContigs()[contigIndex].isAlternateRC) {
+            cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize,
+                clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter,
+                read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM,
+                &editDistance, o_addFrontClipping);
+            if (*o_addFrontClipping != 0) {
+                return false;
+            }
+        } else {
+
+        }
 	}
 
 
@@ -1300,6 +1304,19 @@ SAMFormat::writeRead(
             readGroupString = read->getReadGroup();
         }
     }
+    const Genome::Contig* contig = &context.genome->getContigs()[contigIndex];
+    if (contig->isAlternateRC) {
+        // contig was reverse-complemented when building index
+        // so reverse flags, adjust position; CIGAR string was reversed in computeCigar
+        flags ^= SAM_REVERSE_COMPLEMENT;
+        positionInContig = 1 + max(0, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength);
+    }
+    const Genome::Contig* mateContig = &context.genome->getContigs()[mateContigIndex];
+    if (mateContig->isAlternateRC) {
+        // same for mate
+        flags ^= SAM_NEXT_REVERSED;
+        matePositionInContig = 1 + max(0, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength);
+    }
     int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s%.*s\n",
         qnameLen, read->getId(),
         flags,
@@ -1393,6 +1410,17 @@ SAMFormat::computeCigar(
         return;
     }
 
+    if (contig->isAlternateRC) {
+        // the original reference was reverse-complemented on index build to simplify alignment
+        // so reverse-complement both reference and data for CIGAR string
+        char* dataBuf = (char*)alloca(dataLength);
+        util::toComplement(dataBuf, data, dataLength);
+        data = dataBuf;
+        char* referenceBuf = (char*)alloca(dataLength + MAX_K);
+        util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K);
+        reference = referenceBuf;
+    }
+
     *o_editDistance = lv->computeEditDistanceNormalized(
         reference,
         (int)(dataLength - *o_extraBasesClippedAfter + MAX_K), // Add space incase of indels.  We know there's enough, because the reference is padded.
@@ -1566,6 +1594,16 @@ SAMFormat::validateCigarString(
 		WriteErrorMessage("validateCigarString: read alignment location isn't in a chromosome, genomeLocation %lld\n", GenomeLocationAsInt64(genomeLocation));
 		soft_exit(1);
 	}
+    if (contig->isAlternateRC) {
+        // the original reference was reverse-complemented on index build to simplify alignment
+        // so reverse-complement both reference and data for CIGAR string
+        char* dataBuf = (char*)alloca(dataLength);
+        util::toComplement(dataBuf, data, dataLength);
+        data = dataBuf;
+        char* referenceBuf = (char*)alloca(dataLength + MAX_K);
+        util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K);
+        reference = referenceBuf;
+    }
 
 	if (genomeLocation >= contig->beginningLocation + contig->length - genome->getChromosomePadding()) {
 		WriteErrorMessage("validateCigarString: alignment location is in genome padding: %lld, contig name %s, base %lld, len %lld, padding size %d\n",
diff --git a/tests/alttestgen.py b/tests/alttestgen.py
index 8129565b..e8ce8420 100644
--- a/tests/alttestgen.py
+++ b/tests/alttestgen.py
@@ -84,12 +84,19 @@ def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05):
         self.add(Contig(name, accession, altseq, True, parent, start, isRC))
 
     def get_seq(self, chr, start, end):
-        return self.contigs[chr].seq[start:end]
+        contig = self.contigs[chr]
+        if not contig.isAltRC:
+            return contig.seq[start:end]
+        else:
+            return rc(contig.seq[len(contig.seq) - end : len(contig.seq) - start])
 
     def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None):
         if id == None:
             id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isRC else 'f'))
-        return Read(id, chr, pos, random_mutate(self.get_seq(chr, pos, pos + len), pmut))
+        seq = random_mutate(self.get_seq(chr, pos, pos + len), pmut)  # pmut goes to random_mutate, as in the replaced line
+        if isRC:
+            seq = rc(seq)  # apply rc() to the already-mutated bases when isRC is requested
+        return Read(id, chr, pos, seq)  # Read takes (id, chr, pos, seq), matching the replaced line
 
     def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02):
         id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1+1, chr2, pos2+1)
@@ -116,14 +123,18 @@ def write_alts(self, filename):
                         1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0))
 
 g = Genome()
-g.add(Contig("chr1", "C01", random_bases(3000)))
+g.add(Contig("chr1", "C01", random_bases(5000)))
 g.add_alt("chr1a", "C01A", "chr1", 1000, 2000)
+g.add_alt("chr1b", "C01B", "chr1", 3000, 4000, True)
 g.write_fasta("test.fa")
 g.write_alts("test_alts.txt")
 
 with open("test.sam", "w") as file:
-    for i in range(100, 201, 20):
-        [r1, r2] = g.make_pair('chr1', i, 'chr1a', i)
+    for i in [100, 150, 200, 250, 2100, 2150, 2200, 2250]:
+        if i < 2000:
+            [r1, r2] = g.make_pair('chr1', i, 'chr1a' , i)
+        else:
+            [r1, r2] = g.make_pair('chr1', i, 'chr1b' , i - 2000)
         file.write(r1.to_sam_pair(r2))
         [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000)
         file.write(r1.to_sam_pair(r2))

From 89a1f923858e8076ee40189cc5d324d664198ecb Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Wed, 9 Dec 2015 14:23:04 -0800
Subject: [PATCH 07/19] Allow extra columns in alt map

---
 SNAPLib/Genome.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 42511534..5bac7d14 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -717,7 +717,7 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
             if (endOfFile) {
                 break;
             }
-            switch (columnTypes[columnIndex]) {
+            switch (columnIndex < columnTypes.size() ? columnTypes[columnIndex] : N_COLUMNS) {
             case ALT_SCAF_ACC:
                 alt.accession = p;
                 break;

From 42717f9d6b863dc99b9825fbaae241d9facf452b Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Wed, 9 Dec 2015 15:31:58 -0800
Subject: [PATCH 08/19] Compile on Linux

---
 SNAPLib/BaseAligner.cpp | 4 ++--
 SNAPLib/Genome.h        | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp
index 02efa6c0..d9fdef43 100644
--- a/SNAPLib/BaseAligner.cpp
+++ b/SNAPLib/BaseAligner.cpp
@@ -797,7 +797,7 @@ Return Value:
 
     unsigned weightListToCheck = highestUsedWeightList;
     MatchInfoVector* allMatches = NULL;
-    bool anyAltMatches = FALSE;
+    bool anyAltMatches = false;
     if (genome->hasAltContigs()) {
         allMatches = new MatchInfoVector();
     }
@@ -977,7 +977,7 @@ Return Value:
                             // remember in case there are alt matches
                             if (allMatches != NULL) {
                                 if ((! anyAltMatches) && genome->getContigAtLocation(genomeLocation)->isAlternate) {
-                                    anyAltMatches = TRUE;
+                                    anyAltMatches = true;
                                 }
                                 allMatches->push_back(MatchInfo(genomeLocation, genome->getLiftedLocation(genomeLocation), matchProbability));
                             }
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
index d65d9008..fa593890 100644
--- a/SNAPLib/Genome.h
+++ b/SNAPLib/Genome.h
@@ -246,7 +246,7 @@ class Genome {
 
         struct Contig {
             Contig() : beginningLocation(InvalidGenomeLocation), length(0), nameLength(0), name(NULL),
-                    isAlternate(FALSE), isReverseStrand(FALSE), liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0) {}
+                    isAlternate(false), isReverseStrand(false), liftedLocation(InvalidGenomeLocation), contextBefore(0), contextAfter(0) {}
             GenomeLocation     beginningLocation;
             GenomeDistance     length;
             bool               isAlternate;
@@ -267,7 +267,7 @@ class Genome {
         const Contig *getNextContigAfterLocation(GenomeLocation location) const;
         int getContigNumAtLocation(GenomeLocation location) const;    // Returns the contig number, which runs from 0 .. getNumContigs() - 1.
 
-        inline bool hasAltContigs() const { return FALSE;  } // todo: implement
+        inline bool hasAltContigs() const { return false;  } // todo: implement
 
         GenomeLocation getLiftedLocation(GenomeLocation altLocation) const { return altLocation;  } // todo: implement
 

From caa79174fcf57632ef8cc6985e8c80ba3eea511d Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Wed, 9 Dec 2015 16:17:50 -0800
Subject: [PATCH 09/19] Compile on Linux

---
 SNAPLib/Genome.cpp      | 12 ++++++------
 SNAPLib/GenomeIndex.cpp |  4 ++--
 SNAPLib/SAM.cpp         |  4 ++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 5bac7d14..fee4a750 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -277,7 +277,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc
             }
             genome->contigs[i].liftedLocation = liftedLocation;
 
-            if (isAlternate && contigStart < genome->minAltLocation.location) {
+            if (isAlternate && contigStart < (_int64)genome->minAltLocation) {
                 genome->minAltLocation = contigStart - chromosomePadding / 2;
             }
         }
@@ -534,7 +534,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
     // flip RC contigs
     for (int i = 0; i < nContigs; i++) {
         if (contigs[i].isAlternate && contigs[i].isAlternateRC) {
-            util::toComplement(bases + contigs[i].beginningLocation.location, NULL, (int) contigs[i].length - chromosomePadding);
+	  util::toComplement(bases + (_int64)contigs[i].beginningLocation, NULL, (int) contigs[i].length - chromosomePadding);
         }
     }
 }
@@ -651,7 +651,6 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
     }
     char* q = strchr(p, ',');
     if (q == NULL) {
-err_invalid_column_spec:
         WriteErrorMessage("Invalid columns spec %s\n", columns);
         soft_exit(1);
     }
@@ -674,7 +673,8 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
         } else {
             q = p + strlen(p);
             if (i < PARENT_STOP) {
-                goto err_invalid_column_spec;
+		WriteErrorMessage("Invalid columns spec %s\n", columns);
+		soft_exit(1);
             }
             break;
         }
@@ -689,7 +689,6 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
     for (int columnIndex = 0; !endOfLine; columnIndex++) {
         q = tokenizeToNextTabOrNewline(p, &endOfLine, &endOfFile);
         if (q == NULL) {
-err_file_format:
             WriteErrorMessage("Invalid file format for alt data in %s\n", filename);
             soft_exit(1);
         }
@@ -706,7 +705,8 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
     }
     for (int i = 0; i < N_COLUMNS; i++) {
         if (columnNames[i] != NULL && !columnFound[i]) {
-            goto err_file_format;
+            WriteErrorMessage("Invalid file format for alt data in %s\n", filename);
+            soft_exit(1);
         }
     }
     while (!endOfFile) {
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index fa110167..e3b672f8 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -792,7 +792,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     if (genomeHasAlts && unliftedIndex == NULL) {
         // create a sub-index with only seeds that occur in alt contigs
         snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
-        bool ok = BuildIndexToDirectory(genome, seedLen, slack, TRUE, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
+        bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
             hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index);
         if (!ok) {
             WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer);
@@ -1958,7 +1958,7 @@ GenomeIndex::loadFromDirectory(char *directoryName, bool map, bool prefetch, boo
 		blobFile = NULL;
 	}
 
-    if (liftedIndex == NULL) {
+    if (!liftedIndex) {
         snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, GenomeFileName);
         if (NULL == (index->genome = Genome::loadFromFile(filenameBuffer, chromosomePadding, 0, 0, map))) {
             WriteErrorMessage("GenomeIndex::loadFromDirectory: Failed to load the genome itself\n");
diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp
index dfc7fb5e..7908aea1 100644
--- a/SNAPLib/SAM.cpp
+++ b/SNAPLib/SAM.cpp
@@ -1309,13 +1309,13 @@ SAMFormat::writeRead(
         // contig was reverse-complemented when building index
         // so reverse flags, adjust position; CIGAR string was reversed in computeCigar
         flags ^= SAM_REVERSE_COMPLEMENT;
-        positionInContig = 1 + max(0, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength);
+        positionInContig = 1 + max(0L, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength);
     }
     const Genome::Contig* mateContig = &context.genome->getContigs()[mateContigIndex];
     if (mateContig->isAlternateRC) {
         // same for mate
         flags ^= SAM_NEXT_REVERSED;
-        matePositionInContig = 1 + max(0, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength);
+        matePositionInContig = 1 + max(0L, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength);
     }
     int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s%.*s\n",
         qnameLen, read->getId(),

From 58069569cd4bf84da0c53bc3d4569a33a916de91 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Wed, 9 Dec 2015 18:03:48 -0800
Subject: [PATCH 10/19] Fix windows compile now

---
 SNAPLib/Genome.cpp | 4 ++--
 SNAPLib/SAM.cpp    | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index fee4a750..39428568 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -277,7 +277,7 @@ Genome::loadFromFile(const char *fileName, unsigned chromosomePadding, GenomeLoc
             }
             genome->contigs[i].liftedLocation = liftedLocation;
 
-            if (isAlternate && contigStart < (_int64)genome->minAltLocation) {
+            if (isAlternate && contigStart < GenomeLocationAsInt64(genome->minAltLocation)) {
                 genome->minAltLocation = contigStart - chromosomePadding / 2;
             }
         }
@@ -534,7 +534,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
     // flip RC contigs
     for (int i = 0; i < nContigs; i++) {
         if (contigs[i].isAlternate && contigs[i].isAlternateRC) {
-	  util::toComplement(bases + (_int64)contigs[i].beginningLocation, NULL, (int) contigs[i].length - chromosomePadding);
+	  util::toComplement(bases + GenomeLocationAsInt64(contigs[i].beginningLocation), NULL, (int) contigs[i].length - chromosomePadding);
         }
     }
 }
diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp
index 7908aea1..c2143529 100644
--- a/SNAPLib/SAM.cpp
+++ b/SNAPLib/SAM.cpp
@@ -1414,10 +1414,10 @@ SAMFormat::computeCigar(
         // the original reference was reverse-complemented on index build to simplify alignment
         // so reverse-complement both reference and data for CIGAR string
         char* dataBuf = (char*)alloca(dataLength);
-        util::toComplement(dataBuf, data, dataLength);
+        util::toComplement(dataBuf, data, (int)dataLength);
         data = dataBuf;
         char* referenceBuf = (char*)alloca(dataLength + MAX_K);
-        util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K);
+        util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K);
         reference = referenceBuf;
     }
 
@@ -1598,10 +1598,10 @@ SAMFormat::validateCigarString(
         // the original reference was reverse-complemented on index build to simplify alignment
         // so reverse-complement both reference and data for CIGAR string
         char* dataBuf = (char*)alloca(dataLength);
-        util::toComplement(dataBuf, data, dataLength);
+        util::toComplement(dataBuf, data, (int)dataLength);
         data = dataBuf;
         char* referenceBuf = (char*)alloca(dataLength + MAX_K);
-        util::toComplement(referenceBuf, reference - MAX_K, dataLength + MAX_K);
+        util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K);
         reference = referenceBuf;
     }
 

From 75b3361f22e9a8d69ac5ff16e6e61164d94309d4 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Sun, 13 Dec 2015 07:16:10 -0800
Subject: [PATCH 11/19] Fix alt index build

---
 SNAPLib/Genome.cpp      |  5 ++---
 SNAPLib/GenomeIndex.cpp | 29 +++++------------------------
 2 files changed, 7 insertions(+), 27 deletions(-)

diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 39428568..2869008b 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -593,7 +593,6 @@ char* tokenizeToNextTabOrNewline(char* start, bool* endOfLine, bool* endOfFile)
         } else if (*p == '\r' || *p == '\n') {
             if (*(p + 1) != *p && (*(p + 1) == '\r' || *(p + 1) == '\n')) {
                 *p++ = '\0';
-            } else {
             }
             *p = '\0';
             *endOfLine = true;
@@ -694,11 +693,11 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
         }
         for (int i = 0; i <= N_COLUMNS; i++) {
             if (i < N_COLUMNS && !strcmp(columnNames[i], p)) {
-                columnTypes.add(i);
+                columnTypes.push_back(i);
                 columnFound[i] = true;
                 break;
             } else if (i == N_COLUMNS) {
-                columnTypes.add(N_COLUMNS); // ignore this column
+                columnTypes.push_back(N_COLUMNS); // ignore this column
             }
         }
         p = q;
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index e3b672f8..4db0fef1 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -1339,7 +1339,8 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa
         GenomeLocation singleHit[2], singleRCHit[2];
         context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
 #define CHECK_ALTS_AND_ADD_LIFTED \
-        if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) { \
+        if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \
+                (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \
             bool anyAlts = false; \
             for (int i = 0; i < nHits && ! anyAlts; i++) { \
                 anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; \
@@ -1348,12 +1349,12 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa
                 anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; \
             } \
             if (anyAlts) { \
-                for (int i = 0; i < nHits && !anyAlts; i++) { \
+                for (int i = 0; i < nHits; i++) { \
                     indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large); \
                 } \
                 if (!seed.isOwnReverseComplement()) { \
                     Seed rcSeed = ~seed; \
-                    for (int i = 0; i < nRCHits && !anyAlts; i++) { \
+                    for (int i = 0; i < nRCHits; i++) { \
                         indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large); \
                     } \
                 } \
@@ -1364,27 +1365,7 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa
     else {
         const unsigned *hits, *rcHits;
         context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
-        // CHECK_ALTS_AND_ADD_LIFTED
-        if ((nHits > 0 && genomeLocation == *hits) || (nHits == 0 && nRCHits > 0 && genomeLocation == *rcHits)) {
-            bool anyAlts = false;
-            for (int i = 0; i < nHits && !anyAlts; i++) {
-                anyAlts = genome->getLiftedLocation(hits[i]) != hits[i];
-            }
-            for (int i = 0; i < nRCHits && !anyAlts; i++) {
-                anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i];
-            }
-            if (anyAlts) {
-                for (int i = 0; i < nHits; i++) {
-                    indexSeed(genome->getLiftedLocation(hits[i]), seed, batches, context, stats, large);
-                }
-                if (!seed.isOwnReverseComplement()) {
-                    Seed rcSeed = ~seed;
-                    for (int i = 0; i < nRCHits; i++) {
-                        indexSeed(genome->getLiftedLocation(rcHits[i]), rcSeed, batches, context, stats, large);
-                    }
-                }
-            }
-        }
+        CHECK_ALTS_AND_ADD_LIFTED
     }
 }
 

From 237186c8b5af0cff8220c79a41a03eebeec1c3ac Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Fri, 8 Jan 2016 10:31:04 -0800
Subject: [PATCH 12/19] Coordinate sort of lifted/unlifted index; fix RC
 handling; secondary results

---
 SNAPLib/BaseAligner.cpp                  |   1 +
 SNAPLib/Genome.cpp                       |  12 +-
 SNAPLib/Genome.h                         |   5 +-
 SNAPLib/GenomeIndex.cpp                  | 146 +++++++++++++++++++----
 SNAPLib/GenomeIndex.h                    |   4 +-
 SNAPLib/IntersectingPairedEndAligner.cpp |  73 +++++++++---
 SNAPLib/IntersectingPairedEndAligner.h   |   3 +
 SNAPLib/SAM.cpp                          | 122 +++++++++----------
 tests/alttestgen.py                      |  48 ++++----
 9 files changed, 272 insertions(+), 142 deletions(-)

diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp
index 39c3489f..cd6b72eb 100644
--- a/SNAPLib/BaseAligner.cpp
+++ b/SNAPLib/BaseAligner.cpp
@@ -227,6 +227,7 @@ Routine Description:
     hashTableEpoch = 0;
 
     if (genome->hasAltContigs()) {
+        // todo: BigAlloc / new(allocator) -> fixed size, avoid reallocs; reserve space for max size
         allMatches = new MatchInfoVector();
     }
  
diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 2869008b..8a6a4fb4 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -506,7 +506,8 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
                 if (contigs[i].beginningLocation < minAltLocation) {
                     minAltLocation = contigs[i].beginningLocation - chromosomePadding / 2;
                 }
-                const char* parentName = altMap->getParentContigName(contigs[i].name);
+                GenomeDistance offset;
+                const char* parentName = altMap->getParentContigName(contigs[i].name, &offset);
                 if (parentName == NULL) {
                     WriteErrorMessage("Unable to find parent contig for alt contig %s\n", contigs[i].name);
                     error = true;
@@ -523,7 +524,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
                     WriteErrorMessage("Alt contig %s has alt parent contig %s, should be non-alt\n", contigs[i].name, parentName);
                     error = true; continue;
                 }
-                contigs[i].liftedLocation = parentLocation;
+                contigs[i].liftedLocation = parentLocation + offset;
             }
         }
         if (error) {
@@ -534,7 +535,7 @@ void Genome::adjustAltContigs(AltContigMap* altMap)
     // flip RC contigs
     for (int i = 0; i < nContigs; i++) {
         if (contigs[i].isAlternate && contigs[i].isAlternateRC) {
-	  util::toComplement(bases + GenomeLocationAsInt64(contigs[i].beginningLocation), NULL, (int) contigs[i].length - chromosomePadding);
+	        util::toComplement(bases + GenomeLocationAsInt64(contigs[i].beginningLocation), NULL, (int) contigs[i].length - chromosomePadding);
         }
     }
 }
@@ -805,7 +806,7 @@ void AltContigMap::setAltContig(Genome::Contig* contig)
     contig->isAlternateRC = false;
 }
 
-const char* AltContigMap::getParentContigName(const char* altName)
+const char* AltContigMap::getParentContigName(const char* altName, GenomeDistance* pOffset)
 {
     StringMap::iterator accession = nameToAccession.find(altName);
     if (accession != nameToAccession.end()) {
@@ -813,6 +814,9 @@ const char* AltContigMap::getParentContigName(const char* altName)
         if (alt != altsByAccession.end()) {
             StringMap::iterator parent = accessionToName.find(alt->second.parentAccession);
             if (parent != accessionToName.end()) {
+                if (pOffset != NULL) {
+                    *pOffset = alt->second.parentStart - alt->second.start;
+                }
                 return parent->second.data();
             }
         }
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
index 38c8d087..c01a3b61 100644
--- a/SNAPLib/Genome.h
+++ b/SNAPLib/Genome.h
@@ -276,6 +276,9 @@ class Genome {
 
         GenomeLocation getLiftedLocation(GenomeLocation altLocation) const;
 
+        inline bool isAltLocation(GenomeLocation location) const // true when location is valid, is at or above minAltLocation, and getLiftedLocation() maps it to a different location
+        { return location != InvalidGenomeLocation && location >= minAltLocation && getLiftedLocation(location) != location; }
+
 // unused        Genome *copy() const {return copy(true,true,true);}
 // unused        Genome *copyGenomeOneSex(bool useY, bool useM) const {return copy(!useY,useY,useM);}
 
@@ -337,7 +340,7 @@ class AltContigMap
 
     void setAltContig(Genome::Contig* contig);
 
-    const char* getParentContigName(const char* altName);
+    const char* getParentContigName(const char* altName, GenomeDistance* pOffset = NULL);
 
 private:
 
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 4db0fef1..4ae13022 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -387,9 +387,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     volatile _int64 nBasesProcessed = 0;
     volatile int runningThreadCount;
 
-    SingleWaiterObject doneObject;
-    CreateSingleWaiterObject(&doneObject);
-
     unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads);
     BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads];
 
@@ -398,6 +395,14 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         InitializeExclusiveLock(&hashTableLocks[i]);
     }
 
+    // lifted index needs to be done in two passes, first to build and then to sort
+    int liftedIndexPass = 0;
+
+lifted_index_pass_start:
+
+    SingleWaiterObject doneObject;
+    CreateSingleWaiterObject(&doneObject);
+
     runningThreadCount = nThreads;
 
     GenomeDistance nextChunkToProcess = 0;
@@ -459,6 +464,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 		threadContexts[i].lastBackpointerIndexUsedByThread = lastBackpointerIndexUsedByThread;
 		threadContexts[i].backpointerSpillFile = backpointerSpillFile;
         threadContexts[i].unliftedIndex = unliftedIndex;
+        threadContexts[i].liftedIndexPass = liftedIndexPass;
 
         StartNewThread(BuildHashTablesWorkerThreadMain, &threadContexts[i]);
     }
@@ -496,8 +502,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     // We're done with the raw genome.  Delete it to save some memory.
     //
   
-    bool genomeHasAlts = genome->hasAltContigs();
-    if (! (genomeHasAlts && unliftedIndex == NULL)) {
+    if (!genome->hasAltContigs()) {
         // delete if we won't need it later
         delete genome;
         genome = NULL;
@@ -535,6 +540,10 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 		WriteStatusMessage("%llds\n", (timeInMillis() - spillDone + 500) / 1000);
 	}
 
+    if (unliftedIndex != NULL && liftedIndexPass == 1) {
+        goto lifted_skip_overflow;
+    }
+
     WriteStatusMessage("Building overflow table.\n");
     start = timeInMillis();
     fflush(stdout);
@@ -706,7 +715,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         }
         totalBytesWritten += bytesWrittenThisHashTable;
 
-        if (!(genomeHasAlts && unliftedIndex == NULL)) {
+        if (genome == NULL || !genome->hasAltContigs()) {
             delete hashTables[whichHashTable];
             hashTables[whichHashTable] = NULL;
         }
@@ -716,6 +725,13 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 
     _ASSERT(overflowTableIndex == index->overflowTableSize);    // We used exactly what we expected to use.
 
+    if (unliftedIndex != NULL && liftedIndexPass == 0) {
+        liftedIndexPass = 1;
+        goto lifted_index_pass_start;
+    }
+
+lifted_skip_overflow:
+    
     delete overflowAnchor;
     overflowAnchor = NULL;
 
@@ -783,13 +799,13 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 
     fprintf(indexFile,"%d %d %d %lld %d %d %d %lld %d %d",
         // NOTE: this must be changed if the format no longer supports v5 (pre-alt)
-        genomeHasAlts ? GenomeIndexFormatMajorVersion : GenomeIndexFormatMajorVersionWithoutAlts,
+        genome != NULL && genome->hasAltContigs() ? GenomeIndexFormatMajorVersion : GenomeIndexFormatMajorVersionWithoutAlts,
         GenomeIndexFormatMinorVersion, index->nHashTables, 
         index->overflowTableSize, seedLen, chromosomePaddingSize, hashTableKeySize, totalBytesWritten, large ? 0 : 1, locationSize); 
 
     fclose(indexFile);
  
-    if (genomeHasAlts && unliftedIndex == NULL) {
+    if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) {
         // create a sub-index with only seeds that occur in alt contigs
         snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
         bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
@@ -1264,12 +1280,18 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context)
             continue;
         }
 
-		Seed seed(bases, seedLen);
+        Seed seed(bases, seedLen);
 
         if (!lift) {
             indexSeed(genomeLocation, seed, batches, context, &stats, large);
         } else {
-            indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large);
+            // in the lifted case, we first do one pass to index lifted seeds
+            // and then another pass to sort the unlifted locations by the lifted locations so they correspond
+            if (context->liftedIndexPass == 0) {
+                indexLiftedSeed(genomeLocation, seed, batches, context, &stats, large);
+            } else {
+                resortLiftedSeed(genomeLocation, seed, batches, context, &stats, large);
+            }
         }
     } // For each genome base in our area
 
@@ -1295,7 +1317,6 @@ GenomeIndex::BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context)
 
 const _int64 GenomeIndex::printPeriod = 100000000;
 
-
     void
 GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large)
 {
@@ -1308,28 +1329,27 @@ GenomeIndex::indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBat
     _ASSERT(whichHashTable < nHashTables);
  
 	if (batches[whichHashTable].addSeed(genomeLocation, seed.getLowBases(context->hashTableKeySize), usingComplement)) {
-		AcquireExclusiveLock(&context->hashTableLocks[whichHashTable]);
-		for (unsigned i = 0; i < batches[whichHashTable].nUsed; i++) {
-			ApplyHashTableUpdate(context, whichHashTable, batches[whichHashTable].entries[i].genomeLocation, 
-				batches[whichHashTable].entries[i].lowBases, batches[whichHashTable].entries[i].usingComplement,
-				&stats->bothComplementsUsed, &stats->genomeLocationsInOverflowTable, &stats->seedsWithMultipleOccurrences, large);
-		}
-		ReleaseExclusiveLock(&context->hashTableLocks[whichHashTable]);
+        AcquireExclusiveLock(&context->hashTableLocks[whichHashTable]);
+        for (unsigned i = 0; i < batches[whichHashTable].nUsed; i++) {
+            ApplyHashTableUpdate(context, whichHashTable, batches[whichHashTable].entries[i].genomeLocation,
+                batches[whichHashTable].entries[i].lowBases, batches[whichHashTable].entries[i].usingComplement,
+                &stats->bothComplementsUsed, &stats->genomeLocationsInOverflowTable, &stats->seedsWithMultipleOccurrences, large);
+        }
+        ReleaseExclusiveLock(&context->hashTableLocks[whichHashTable]);
 
-		_int64 newNBasesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[whichHashTable].nUsed + stats->unrecordedSkippedSeeds);
+        _int64 newNBasesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, batches[whichHashTable].nUsed + stats->unrecordedSkippedSeeds);
 
-		if ((unsigned)(newNBasesProcessed / printPeriod) > (unsigned)((newNBasesProcessed - batches[whichHashTable].nUsed - stats->unrecordedSkippedSeeds) / printPeriod)) {
-			WriteStatusMessage("Indexing %lld / %lld\n", (newNBasesProcessed / printPeriod) * printPeriod, context->genome->getCountOfBases());
-		}
-		stats->unrecordedSkippedSeeds = 0;
-		batches[whichHashTable].clear();
-	} // If we filled a batch
+        if ((unsigned)(newNBasesProcessed / printPeriod) >(unsigned)((newNBasesProcessed - batches[whichHashTable].nUsed - stats->unrecordedSkippedSeeds) / printPeriod)) {
+            WriteStatusMessage("Indexing %lld / %lld\n", (newNBasesProcessed / printPeriod) * printPeriod, context->genome->getCountOfBases());
+        }
+        stats->unrecordedSkippedSeeds = 0;
+        batches[whichHashTable].clear();
+    } // If we filled a batch
 }
 
     void
 GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large)
 {
-    // todo: optimize
     // if this is first occurrence of seed in unlifted index, checks if seed is in any alts
     // and if so, adds all locations to this index, lifting alts to non-alt locations
 
@@ -1368,6 +1388,80 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa
         CHECK_ALTS_AND_ADD_LIFTED
     }
 }
+    
+    void
+dualSort32(
+    _int64 n,
+    unsigned* keys,
+    unsigned* values)
+{
+    // todo: optimize sorting; for now this is a simple O(n^2) exchange sort (ascending by keys[], with values[] swapped in lockstep)
+    unsigned t; // swap temporary referenced by the DUAL_SORT macro body
+#define DUAL_SORT \
+    if (n < 2) { \
+        return; \
+    } \
+    for (_int64 i = 0; i < n - 1; i++) { \
+        for (_int64 j = n - 1; j > i; j--) { \
+            if (keys[i] > keys[j]) { \
+                t = keys[i]; \
+                keys[i] = keys[j]; \
+                keys[j] = t; \
+                t = values[i]; \
+                values[i] = values[j]; \
+                values[j] = t; \
+            } \
+        } \
+    }
+    DUAL_SORT // expands the sort here; the macro stays defined and dualSort expands it as well
+}
+
+    void
+dualSort(
+    _int64 n,
+    GenomeLocation* keys,
+    GenomeLocation* values)
+{
+    GenomeLocation t; // swap temporary referenced by the DUAL_SORT macro body
+    DUAL_SORT // same in-place O(n^2) exchange sort as dualSort32: orders keys[] ascending and applies identical swaps to values[]
+}
+
+    void
+GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large)
+{
+    // Second lifted-index pass: look the seed up in this index and in context->unliftedIndex, then sort both hit lists in place by lifted location.
+    // NOTE: this leaves the unlifted list in a non-sorted order; batches, stats and large are not used in this function.
+
+    _int64 nHits, nRCHits;
+    _int64 nLiftedHits, nLiftedRCHits;
+    if (doesGenomeIndexHave64BitLocations()) {
+        const GenomeLocation *hits, *rcHits;
+        GenomeLocation singleHit[2], singleRCHit[2];
+        const GenomeLocation *liftedHits, *liftedRCHits;
+        GenomeLocation liftedSingleHit[2], liftedSingleRCHit[2];
+        lookupSeed(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits, &liftedSingleHit[1], &liftedSingleRCHit[1]); // lookup in this index (presumably the lifted one - confirm)
+        if (nLiftedHits > 1 || nLiftedRCHits > 1) { // nothing to reorder unless some list has at least two entries
+            context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
+            _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); // the sorts below walk both arrays with the same count
+            if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) { // NOTE(review): can match for hits[0] and again for rcHits[0], so a seed may be sorted more than once - confirm this is harmless across threads
+                dualSort(nHits, (GenomeLocation*)liftedHits, (GenomeLocation*)hits); // const is cast away: the arrays returned by lookupSeed are permuted in place
+                dualSort(nRCHits, (GenomeLocation*)liftedRCHits, (GenomeLocation*)rcHits);
+            }
+        }
+    } else {
+        const unsigned *hits, *rcHits;
+        const unsigned *liftedHits, *liftedRCHits;
+        lookupSeed32(seed, &nLiftedHits, &liftedHits, &nLiftedRCHits, &liftedRCHits); // 32-bit variant of the same lookup
+        if (nLiftedHits > 1 || nLiftedRCHits > 1) { // nothing to reorder unless some list has at least two entries
+            context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
+            _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits); // the sorts below walk both arrays with the same count
+            if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) { // same trigger condition as the 64-bit branch
+                dualSort32(nHits, (unsigned*)liftedHits, (unsigned*)hits); // const is cast away: the arrays returned by lookupSeed32 are permuted in place
+                dualSort32(nRCHits, (unsigned*)liftedRCHits, (unsigned*)rcHits);
+            }
+        }
+    }
+}
 
         void 
 GenomeIndex::ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement,
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
index ca45201e..c0a45840 100644
--- a/SNAPLib/GenomeIndex.h
+++ b/SNAPLib/GenomeIndex.h
@@ -244,6 +244,7 @@ class GenomeIndex {
 
         // used for building sub-index of only seeds that occur in alt contigs
         GenomeIndex                     *unliftedIndex;
+        int                              liftedIndexPass;
 
         ExclusiveLock                   *hashTableLocks;
         ExclusiveLock                   *overflowTableLock;
@@ -294,8 +295,9 @@ class GenomeIndex {
 
     virtual void indexSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
     virtual void indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
+    virtual void resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
     virtual void completeIndexing(PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large);
-
+    
     static void BuildHashTablesWorkerThreadMain(void *param);
     void BuildHashTablesWorkerThread(BuildHashTablesThreadContext *context);
     static void ApplyHashTableUpdate(BuildHashTablesThreadContext *context, _uint64 whichHashTable, GenomeLocation genomeLocation, _uint64 lowBases, bool usingComplement,
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
index 45d2713b..c1cc00ec 100644
--- a/SNAPLib/IntersectingPairedEndAligner.cpp
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -33,6 +33,9 @@ Revision History:
 extern bool _DumpAlignments;    // From BaseAligner.cpp
 #endif  // _DEBUG
 
+static const double EPSILON_FACTOR_HI = 1.0000000001; // scales probabilityOfBestPair: an equal-score pair must reach this to become the new best on probability alone
+static const double EPSILON_FACTOR_LO = 0.9999999999; // scales probabilityOfBestPair: near-tie threshold that lets a non-alt pair replace a best pair that has alts
+
 IntersectingPairedEndAligner::IntersectingPairedEndAligner(
         GenomeIndex  *index_,
         unsigned      maxReadSize_,
@@ -202,6 +205,7 @@ IntersectingPairedEndAligner::align(
     GenomeLocation bestResultGenomeLocation[NUM_READS_PER_PAIR];
     Direction bestResultDirection[NUM_READS_PER_PAIR];
     unsigned bestResultScore[NUM_READS_PER_PAIR];
+    bool bestPairHasAlts = false;
     unsigned popularSeedsSkipped[NUM_READS_PER_PAIR];
 
     reads[0][FORWARD] = read0;
@@ -630,7 +634,7 @@ IntersectingPairedEndAligner::align(
         scoreLocation(readWithFewerHits, setPairDirection[candidate->whichSetPair][readWithFewerHits], candidate->readWithFewerHitsUnliftedGenomeLocation,
             candidate->seedOffset, scoreLimit, &fewerEndScore, &fewerEndMatchProbability, &fewerEndGenomeLocationOffset);
 
-        // todo: fix _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore);
+        _ASSERT(-1 == fewerEndScore || fewerEndScore >= candidate->bestPossibleScore);
 
 #ifdef _DEBUG
         if (_DumpAlignments) {
@@ -671,7 +675,7 @@ IntersectingPairedEndAligner::align(
                         }
 #endif // _DEBUG
 
-                        // !! FIX THIS BEFORE CHECKIN !! _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore);
+                        _ASSERT(-1 == mate->score || mate->score >= mate->bestPossibleScore);
 
                         mate->scoreLimit = scoreLimit - fewerEndScore;
                     }
@@ -694,6 +698,7 @@ IntersectingPairedEndAligner::align(
                         // because it's a worse version of this location.
                         //
                         MergeAnchor *mergeAnchor = candidate->mergeAnchor;
+                        MergeAnchor *unliftedMergeAnchor = candidate->unliftedMergeAnchor;
 
                         if (NULL == mergeAnchor) {
                             //
@@ -707,6 +712,7 @@ IntersectingPairedEndAligner::align(
 
                                 if (mergeCandidate->mergeAnchor != NULL) {
                                     candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor;
+                                    candidate->unliftedMergeAnchor = unliftedMergeAnchor = mergeCandidate->unliftedMergeAnchor; // update the unlifted local; mergeAnchor must keep the lifted anchor assigned on the previous line
                                     break;
                                 }
                             }
@@ -716,10 +722,11 @@ IntersectingPairedEndAligner::align(
                                             mergeCandidate < scoringCandidatePool + lowestFreeScoringCandidatePoolEntry &&
                                             genomeLocationIsWithin(mergeCandidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, 50) &&
                                             mergeCandidate->whichSetPair == candidate->whichSetPair;
-                                            mergeCandidate--) {
+                                            mergeCandidate++) {
 
                                     if (mergeCandidate->mergeAnchor != NULL) {
                                         candidate->mergeAnchor = mergeAnchor = mergeCandidate->mergeAnchor;
+                                        candidate->unliftedMergeAnchor = unliftedMergeAnchor = mergeCandidate->unliftedMergeAnchor;
                                         break;
                                     }
                                 }
@@ -727,11 +734,12 @@ IntersectingPairedEndAligner::align(
                         }
 
                         bool merged;
+                        bool mergedUnlifted = true; // was uninitialized when an existing anchor has no unlifted counterpart; true makes !(merged && mergedUnlifted) reduce to !merged in that case
 
                         double oldPairProbability;
 
                         if (NULL == mergeAnchor) {
-                            if (firstFreeMergeAnchor >= mergeAnchorPoolSize) {
+                            if (firstFreeMergeAnchor >= mergeAnchorPoolSize - doesGenomeIndexHaveAlts) {
                                 WriteErrorMessage("Ran out of merge anchor pool entries.  Perhaps rerunning with a larger value of -mcp will help\n");
                                 soft_exit(1);
                             }
@@ -744,30 +752,45 @@ IntersectingPairedEndAligner::align(
                                 pairProbability, pairScore);
 
                             merged = false;
+                            mergedUnlifted = false;
                             oldPairProbability = 0;
                             candidate->mergeAnchor = mergeAnchor;
+                            if (doesGenomeIndexHaveAlts) {
+                                unliftedMergeAnchor = &mergeAnchorPool[firstFreeMergeAnchor];
+                                candidate->unliftedMergeAnchor = unliftedMergeAnchor;
+                                firstFreeMergeAnchor++;
+                                unliftedMergeAnchor->init(mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset,
+                                    pairProbability, pairScore);
+                            }
                         } else {
                             merged = mergeAnchor->checkMerge(mate->readWithMoreHitsGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset,
                                 pairProbability, pairScore, doesGenomeIndexHaveAlts && (! candidate->isAlt()) && (!mate->isAlt()), &oldPairProbability);
+                            if (unliftedMergeAnchor != NULL) {
+                                double ignore;
+                                mergedUnlifted = merged && unliftedMergeAnchor->checkMerge(mate->readWithMoreHitsUnliftedGenomeLocation + mate->genomeOffset, candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset,
+                                    pairProbability, pairScore, false, &ignore);
+                            }
                         }
 
-                        if (!merged) {
+                        if (!(merged && mergedUnlifted)) {
                             //
                             // Back out the probability of the old match that we're merged with, if any.  The max
                             // is necessary because a + b - b is not necessarily a in floating point.  If there
                             // was no merge, the oldPairProbability is 0.
                             //
-                            probabilityOfAllPairs = __max(0, probabilityOfAllPairs - oldPairProbability);
-
+                            if (!merged) {
+                                probabilityOfAllPairs = __max(0, probabilityOfAllPairs - oldPairProbability);
+                            }
                             bool isBestHit = false;
 
                             if (pairScore <= maxK && (pairScore < bestPairScore ||
-                                (pairScore == bestPairScore && (pairProbability > probabilityOfBestPair ||
-                                (pairProbability == probabilityOfBestPair && (! candidate->isAlt()) && (!mate->isAlt())))))) {
+                                (pairScore == bestPairScore && (pairProbability >= probabilityOfBestPair*EPSILON_FACTOR_HI ||
+                                    (bestPairHasAlts && pairProbability >= probabilityOfBestPair*EPSILON_FACTOR_LO && (!candidate->isAlt()) && (!mate->isAlt())))))) {
                                 //
                                 // A new best hit.
                                 //
-                                if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= pairScore - bestPairScore) {
+                                // Code review note: was pairScore-bestPairScore which is negative int, i.e. very large unsigned, so would only save secondary w/equal score
+                                if (maxEditDistanceForSecondaryResults != -1 && (unsigned)maxEditDistanceForSecondaryResults >= bestPairScore - pairScore) {
                                     //
                                     // Move the old best to be a secondary alignment.  This won't happen on the first time we get a valid alignment,
                                     // because bestPairScore is initialized to be very large.
@@ -801,6 +824,7 @@ IntersectingPairedEndAligner::align(
                                 bestResultScore[readWithMoreHits] = mate->score;
                                 bestResultDirection[readWithFewerHits] = setPairDirection[candidate->whichSetPair][readWithFewerHits];
                                 bestResultDirection[readWithMoreHits] = setPairDirection[candidate->whichSetPair][readWithMoreHits];
+                                bestPairHasAlts = candidate->isAlt() || mate->isAlt();
 
                                 if (!noUkkonen) {
                                     scoreLimit = bestPairScore + extraSearchDepth;
@@ -833,7 +857,9 @@ IntersectingPairedEndAligner::align(
                                 }
                             }
 
-                            probabilityOfAllPairs += pairProbability;
+                            if (!merged) {
+                                probabilityOfAllPairs += pairProbability;
+                            }
     #ifdef  _DEBUG
                             if (_DumpAlignments) {
                                 printf("Added %e (= %e * %e) @ (%u, %u), giving new probability of all pairs %e, score %d = %d + %d%s\n",
@@ -1272,14 +1298,21 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
             unsigned clause2 = probe == 0;
 
             if (clause1 && (clause2 || probeMinusOneHit > maxGenomeLocationToFindThisSeed)) {
-                if (probeHit - seedOffset > bestLocationFound) {
-					anyFound = true;
-                    mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset;
-                    if (actualUnliftedGenomeLocationFound != NULL) {
-                        *actualUnliftedGenomeLocationFound = (doesGenomeIndexHave64BitLocations
-                            ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe]) - seedOffset;
+                if (actualUnliftedGenomeLocationFound == NULL) {
+                    if (probeHit - seedOffset > bestLocationFound) {
+                        anyFound = true;
+                        mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset;
+                        *seedOffsetFound = seedOffset;
+                    }
+                } else {
+                    GenomeLocation bestUnliftedLocationFound = doesGenomeIndexHave64BitLocations ? lookups64[i].unliftedHits[probe] : lookups32[i].unliftedHits[probe];
+                    if (probeHit - seedOffset > bestLocationFound ||
+                        (probeHit - seedOffset == bestLocationFound && *actualUnliftedGenomeLocationFound != bestUnliftedLocationFound)) {
+                        anyFound = true;
+                        mostRecentLocationReturned = *actualGenomeLocationFound = bestLocationFound = probeHit - seedOffset;
+                        *actualUnliftedGenomeLocationFound = bestUnliftedLocationFound - seedOffset;
+                        *seedOffsetFound = seedOffset;
                     }
-                    *seedOffsetFound = seedOffset;
                 }
 
                 if (doesGenomeIndexHave64BitLocations) {
@@ -1449,7 +1482,9 @@ IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitL
         // Within merge distance.  Keep the better score (or if they're tied the better match probability).
         //
         if (newPairScore < pairScore || (newPairScore == pairScore &&
-            (newMatchProbability > matchProbability || (newMatchProbability == matchProbability && newPairIsNonAlt)))) {
+            (newMatchProbability >= matchProbability*EPSILON_FACTOR_HI ||
+            (newMatchProbability >= matchProbability*EPSILON_FACTOR_LO && newPairIsNonAlt &&
+                (newMoreHitLocation != locationForReadWithMoreHits || newFewerHitLocation != locationForReadWithFewerHits))))) {
 #ifdef _DEBUG
             if (_DumpAlignments) {
                 printf("Merge replacement at anchor (%u, %u), loc (%u, %u), old match prob %e, new match prob %e, old pair score %d, new pair score %d\n",
diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h
index 403e28e6..39c258ef 100644
--- a/SNAPLib/IntersectingPairedEndAligner.h
+++ b/SNAPLib/IntersectingPairedEndAligner.h
@@ -407,6 +407,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner
         void init(GenomeLocation readWithMoreHitsGenomeLocation_, unsigned bestPossibleScore_, unsigned seedOffset_, GenomeLocation readWithMoreHitsUnliftedGenomeLocation_) {
             readWithMoreHitsGenomeLocation = readWithMoreHitsGenomeLocation_;
             readWithMoreHitsUnliftedGenomeLocation = readWithMoreHitsUnliftedGenomeLocation_;
+            _ASSERT(readWithMoreHitsUnliftedGenomeLocation != -1);
             bestPossibleScore = bestPossibleScore_;
             seedOffset = seedOffset_;
             score = -2;
@@ -420,6 +421,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner
     struct ScoringCandidate {
         ScoringCandidate *      scoreListNext;              // This is a singly-linked list
         MergeAnchor *           mergeAnchor;
+        MergeAnchor *           unliftedMergeAnchor;
         unsigned                scoringMateCandidateIndex;  // Index into the array of scoring mate candidates where we should look 
         GenomeLocation          readWithFewerHitsGenomeLocation;
         GenomeLocation          readWithFewerHitsUnliftedGenomeLocation;
@@ -440,6 +442,7 @@ class IntersectingPairedEndAligner : public PairedEndAligner
             bestPossibleScore = bestPossibleScore_;
             scoreListNext = scoreListNext_;
             mergeAnchor = NULL;
+            unliftedMergeAnchor = NULL;
          }
         bool isAlt() const { return readWithFewerHitsGenomeLocation != readWithFewerHitsUnliftedGenomeLocation; }
     };
diff --git a/SNAPLib/SAM.cpp b/SNAPLib/SAM.cpp
index c2143529..0317f139 100644
--- a/SNAPLib/SAM.cpp
+++ b/SNAPLib/SAM.cpp
@@ -1062,23 +1062,8 @@ SAMFormat::createSAMLine(
         return false;
     }
 
-    if (direction == RC) {
-      for (unsigned i = 0; i < fullLength; i++) {
-        data[fullLength - 1 - i] = COMPLEMENT[read->getUnclippedData()[i]];
-        quality[fullLength - 1 - i] = read->getUnclippedQuality()[i];
-      }
-      clippedData = &data[fullLength - clippedLength - read->getFrontClippedLength()];
-      basesClippedBefore = fullLength - clippedLength - read->getFrontClippedLength();
-      basesClippedAfter = read->getFrontClippedLength();
-    } else {
-      memcpy(data, read->getUnclippedData(), read->getUnclippedLength());
-      memcpy(quality, read->getUnclippedQuality(), read->getUnclippedLength());
-      clippedData = read->getData();
-      basesClippedBefore = read->getFrontClippedLength();
-      basesClippedAfter = fullLength - clippedLength - basesClippedBefore;
-    }
-
     int editDistance = -1;
+    const Genome::Contig* contig = NULL;
     if (genomeLocation != InvalidGenomeLocation) {
         if (direction == RC) {
             flags |= SAM_REVERSE_COMPLEMENT;
@@ -1092,12 +1077,35 @@ SAMFormat::createSAMLine(
         positionInContig = genomeLocation - contig->beginningLocation + 1; // SAM is 1-based
         mapQuality = max(0, min(70, mapQuality));       // FIXME: manifest constant.
 
+        if (contig->isAlternateRC) {
+            // contig was reverse-complemented when building index
+            flags ^= SAM_REVERSE_COMPLEMENT;
+            positionInContig = 1 + max(0L, (contig->length - genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength);
+            direction = direction == RC ? FORWARD : RC;
+        }
     } else {
         flags |= SAM_UNMAPPED;
         mapQuality = 0;
         *extraBasesClippedBefore = 0;
     }
 
+    if (direction == RC) {
+      for (unsigned i = 0; i < fullLength; i++) {
+        data[fullLength - 1 - i] = COMPLEMENT[read->getUnclippedData()[i]];
+        quality[fullLength - 1 - i] = read->getUnclippedQuality()[i];
+      }
+      clippedData = &data[fullLength - clippedLength - read->getFrontClippedLength()];
+      basesClippedBefore = fullLength - clippedLength - read->getFrontClippedLength();
+      basesClippedAfter = read->getFrontClippedLength();
+    } else {
+      memcpy(data, read->getUnclippedData(), read->getUnclippedLength());
+      memcpy(quality, read->getUnclippedQuality(), read->getUnclippedLength());
+      clippedData = read->getData();
+      basesClippedBefore = read->getFrontClippedLength();
+      basesClippedAfter = fullLength - clippedLength - basesClippedBefore;
+    }
+
+
     if (hasMate) {
         flags |= SAM_MULTI_SEGMENT;
         flags |= (firstInPair ? SAM_FIRST_SEGMENT : SAM_LAST_SEGMENT);
@@ -1112,6 +1120,11 @@ SAMFormat::createSAMLine(
             if (mateDirection == RC) {
                 flags |= SAM_NEXT_REVERSED;
             }
+            if (mateContig->isAlternateRC) {
+                // mate contig was reverse-complemented when building index: flip the mate strand flag and mirror its position using the mate's own unclipped length
+                flags ^= SAM_NEXT_REVERSED;
+                matePositionInContig = 1 + max(0L, (mateContig->length - genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)mate->getUnclippedLength());
+            }
 
             if (genomeLocation == InvalidGenomeLocation) {
                 //
@@ -1138,16 +1151,17 @@ SAMFormat::createSAMLine(
             if (alignedAsPair) {
                 flags |= SAM_ALL_ALIGNED;
             }
-            // Also compute the length of the whole paired-end string whose ends we saw. This is slightly
-            // tricky because (a) we may have clipped some bases before/after each end and (b) we need to
-            // give a signed result based on whether our read is first or second in the pair.
-            GenomeLocation myStart = genomeLocation - basesClippedBefore;
-            GenomeLocation myEnd = genomeLocation + clippedLength + basesClippedAfter;
-            _int64 mateBasesClippedBefore = mate->getFrontClippedLength();
-            _int64 mateBasesClippedAfter = mate->getUnclippedLength() - mate->getDataLength() - mateBasesClippedBefore;
-            GenomeLocation mateStart = mateLocation - (mateDirection == RC ? mateBasesClippedAfter : mateBasesClippedBefore);
-            GenomeLocation mateEnd = mateLocation + mate->getDataLength() + (mateDirection == FORWARD ? mateBasesClippedAfter : mateBasesClippedBefore);
-			if (contigName == matecontigName) { // pointer (not value) comparison, but that's OK.
+            // todo: should this look at lifted locations for alt contigs that map to same non-alt contig?
+            if (contigIndex == mateContigIndex) {
+                // Also compute the length of the whole paired-end string whose ends we saw. This is slightly
+                // tricky because (a) we may have clipped some bases before/after each end and (b) we need to
+                // give a signed result based on whether our read is first or second in the pair.
+                GenomeDistance myStart = positionInContig - basesClippedBefore;
+                GenomeDistance myEnd = positionInContig + clippedLength + basesClippedAfter;
+                _int64 mateBasesClippedBefore = mate->getFrontClippedLength();
+                _int64 mateBasesClippedAfter = mate->getUnclippedLength() - mate->getDataLength() - mateBasesClippedBefore;
+                GenomeDistance mateStart = matePositionInContig - (mateDirection == RC ? mateBasesClippedAfter : mateBasesClippedBefore);
+                GenomeDistance mateEnd = matePositionInContig + mate->getDataLength() + (mateDirection == FORWARD ? mateBasesClippedAfter : mateBasesClippedBefore);
 				if (myStart < mateStart) {
 					templateLength = mateEnd - myStart;
 				} else {
@@ -1228,16 +1242,12 @@ SAMFormat::writeRead(
     }
 
 	if (genomeLocation != InvalidGenomeLocation) {
-        if (!context.genome->getContigs()[contigIndex].isAlternateRC) {
-            cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize,
-                clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter,
-                read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM,
-                &editDistance, o_addFrontClipping);
-            if (*o_addFrontClipping != 0) {
-                return false;
-            }
-        } else {
-
+        cigar = computeCigarString(context.genome, lv, cigarBuf, cigarBufSize, cigarBufWithClipping, cigarBufWithClippingSize,
+            clippedData, clippedLength, basesClippedBefore, extraBasesClippedBefore, basesClippedAfter,
+            read->getOriginalFrontHardClipping(), read->getOriginalBackHardClipping(), genomeLocation, direction, useM,
+            &editDistance, o_addFrontClipping);
+        if (*o_addFrontClipping != 0) {
+            return false;
         }
 	}
 
@@ -1304,19 +1314,6 @@ SAMFormat::writeRead(
             readGroupString = read->getReadGroup();
         }
     }
-    const Genome::Contig* contig = &context.genome->getContigs()[contigIndex];
-    if (contig->isAlternateRC) {
-        // contig was reverse-complemented when building index
-        // so reverse flags, adjust position; CIGAR string was reversed in computeCigar
-        flags ^= SAM_REVERSE_COMPLEMENT;
-        positionInContig = 1 + max(0L, (contig->length - context.genome->getChromosomePadding() - positionInContig + 1) - (_int64)fullLength);
-    }
-    const Genome::Contig* mateContig = &context.genome->getContigs()[mateContigIndex];
-    if (mateContig->isAlternateRC) {
-        // same for mate
-        flags ^= SAM_NEXT_REVERSED;
-        matePositionInContig = 1 + max(0L, (mateContig->length - context.genome->getChromosomePadding() - matePositionInContig + 1) - (_int64)fullLength);
-    }
     int charsInString = snprintf(buffer, bufferSpace, "%.*s\t%d\t%s\t%u\t%d\t%s\t%s\t%u\t%lld\t%.*s\t%.*s%s%.*s%s%s\tPG:Z:SNAP%s%.*s\n",
         qnameLen, read->getId(),
         flags,
@@ -1388,6 +1385,16 @@ SAMFormat::computeCigar(
 
     const Genome::Contig *contig = genome->getContigAtLocation(genomeLocation);
 
+    const char *reference = genome->getSubstring(genomeLocation, dataLength);
+    if (contig->isAlternateRC) {
+        // the original reference was reverse-complemented on index build to simplify alignment
+        // so reverse-complement reference for CIGAR string
+        // data was already flipped in createSAMLine if needed
+        char* referenceBuf = (char*)alloca(dataLength + MAX_K);
+        util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K);
+        reference = referenceBuf;
+    }
+
     if (genomeLocation + dataLength > contig->beginningLocation + contig->length - genome->getChromosomePadding()) {
         //
         // The read hangs off the end of the contig.  Soft clip it at the end.  This is a tentative amount that assumes no net indels in the
@@ -1398,7 +1405,6 @@ SAMFormat::computeCigar(
         *o_extraBasesClippedAfter = 0;
     }
 
-    const char *reference = genome->getSubstring(genomeLocation, dataLength);
     if (NULL == reference) {
         //
         // Fell off the end of the contig.
@@ -1410,16 +1416,6 @@ SAMFormat::computeCigar(
         return;
     }
 
-    if (contig->isAlternateRC) {
-        // the original reference was reverse-complemented on index build to simplify alignment
-        // so reverse-complement both reference and data for CIGAR string
-        char* dataBuf = (char*)alloca(dataLength);
-        util::toComplement(dataBuf, data, (int)dataLength);
-        data = dataBuf;
-        char* referenceBuf = (char*)alloca(dataLength + MAX_K);
-        util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K);
-        reference = referenceBuf;
-    }
 
     *o_editDistance = lv->computeEditDistanceNormalized(
         reference,
@@ -1596,10 +1592,8 @@ SAMFormat::validateCigarString(
 	}
     if (contig->isAlternateRC) {
         // the original reference was reverse-complemented on index build to simplify alignment
-        // so reverse-complement both reference and data for CIGAR string
-        char* dataBuf = (char*)alloca(dataLength);
-        util::toComplement(dataBuf, data, (int)dataLength);
-        data = dataBuf;
+        // so reverse-complement reference for CIGAR string
+        // data was already flipped in createSAMLine if needed
         char* referenceBuf = (char*)alloca(dataLength + MAX_K);
         util::toComplement(referenceBuf, reference - MAX_K, (int)dataLength + MAX_K);
         reference = referenceBuf;
diff --git a/tests/alttestgen.py b/tests/alttestgen.py
index e8ce8420..20640293 100644
--- a/tests/alttestgen.py
+++ b/tests/alttestgen.py
@@ -46,14 +46,6 @@ def __init__(self, id, chr, pos, seq, qual=None):
     def __str__(self):
         return "Read({}, {}, {}, {})".format(self.id, self.chr, self.pos, self.seq)
 
-    def to_sam_pair(self, other):
-        r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format(
-            self.id, 99, self.chr, self.pos + 1, 60, len(self.seq), other.chr,
-            other.pos + 1, abs(self.pos - other.pos + len(other.seq)), self.seq, 'A'*len(self.seq))
-        return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format(
-            other.id, 147, other.chr, other.pos + 1, 60, len(other.seq), self.chr,
-            self.pos + 1, abs(self.pos - other.pos + len(other.seq)), other.seq, 'A'*len(other.seq))
-
 class Contig:
     def __init__(self, name, accession, seq, isAlt=False, parent=None, parentLoc = 0, isAltRC=False):
         self.name = name
@@ -83,35 +75,37 @@ def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05):
             altseq = rc(altseq)
         self.add(Contig(name, accession, altseq, True, parent, start, isRC))
 
-    def get_seq(self, chr, start, end):
-        contig = self.contigs[chr]
-        if not contig.isAltRC:
-            return contig.seq[start:end]
-        else:
-            return rc(contig.seq[len(contig.seq) - end : len(contig.seq) - start])
-
-    def make_read(self, chr, pos, isRC=False, len=100, pmut=.02, id=None):
+    def make_read(self, chr, pos, isReverse=False, len=100, pmut=.02, id=None):
         if id == None:
-            id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isRC else 'f'))
-        seq = random_mutate(self.get_seq(chr, pos, pos + len))
-        if isRC:
-            seq = rc(seq)
-        return Read(id, chr, pos, seq, pmut)
+            id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isReverse else 'f'))
+        seq = random_mutate(self.contigs[chr].seq[pos:pos + len], pmut)
+        return Read(id, chr, pos, seq)
 
     def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02):
-        id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1+1, chr2, pos2+1)
+        id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1 + 1, chr2, pos2 + 1)
         r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1")
         r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2")
         return [r1, r2]
 
+    def to_sam_pair(self, read1, read2):
+        rc1 = 1 if self.contigs[read1.chr].isAltRC else 0
+        rc2 = 0 if self.contigs[read2.chr].isAltRC else 1
+        r1 = "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format(
+            read1.id, 67+16*rc1+32*rc2, read1.chr, read1.pos + 1, 60, len(read1.seq), read2.chr,
+            read2.pos + 1, abs(read1.pos - read2.pos + len(read2.seq)), read1.seq, (['ABCD','DCBA'][rc1]*int(len(read1.seq)/4+1))[:len(read1.seq)])
+        return r1 + "{}\t{}\t{}\t{}\t{}\t{}M\t{}\t{}\t{}\t{}\t{}\n".format(
+            read2.id, 131+16*rc2+32*rc1, read2.chr, read2.pos + 1, 60, len(read2.seq), read1.chr,
+            read1.pos + 1, abs(read1.pos - read2.pos + len(read2.seq)), read2.seq, (['ABCD','DCBA'][rc2]*int(len(read2.seq)/4+1))[:len(read2.seq)])
+
     def write_fasta(self, filename):
         with open(filename, 'w') as file:
             for write_alts in [False, True]:
                 for contig in self.contigs.values():
                     if contig.isAlt == write_alts:
                         file.write(">{}|gb|{}\n".format(contig.name, contig.accession))
-                        for i in range(0, len(contig.seq), 80):
-                            file.write("{}\n".format(contig.seq[i:i+80]))
+                        LINE_LEN=100
+                        for i in range(0, len(contig.seq), LINE_LEN):
+                            file.write("{}\n".format(contig.seq[i:i+LINE_LEN]))
 
     def write_alts(self, filename):
         with open(filename, 'w') as file:
@@ -134,7 +128,7 @@ def write_alts(self, filename):
         if i < 2000:
             [r1, r2] = g.make_pair('chr1', i, 'chr1a' , i)
         else:
-            [r1, r2] = g.make_pair('chr1', i, 'chr1b' , i - 2000)
-        file.write(r1.to_sam_pair(r2))
+            [r1, r2] = g.make_pair('chr1', i, 'chr1b' , 900 - (i - 2000))
+        file.write(g.to_sam_pair(r1,r2))
         [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000)
-        file.write(r1.to_sam_pair(r2))
+        file.write(g.to_sam_pair(r1,r2))

From 68f90d2a7c0c4f2b3c08db7f393e4efb1dd834b0 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Fri, 8 Jan 2016 15:45:45 -0800
Subject: [PATCH 13/19] Fix lifted index sort

---
 SNAPLib/GenomeIndex.cpp                  | 85 +++++++++++++++++-------
 SNAPLib/IntersectingPairedEndAligner.cpp | 46 ++++++++-----
 tests/alttestgen.py                      | 40 +++++++----
 3 files changed, 116 insertions(+), 55 deletions(-)

diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 4ae13022..a257c3c0 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -747,6 +747,19 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
         delete [] histogram;
     }
 
+    if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) {
+        // Create a sub-index containing only the seeds that occur in alt contigs.
+        // The lifted index must be built here, before the index is saved below, because building it reorders the unlifted index's overflow table.
+        snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
+        bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
+            hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index);
+        if (!ok) {
+            WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer);
+            soft_exit(1);
+            return false;
+        }
+    }
+
     //
     // Now save out the part of the index that's independent of the genome itself.
     //
@@ -805,18 +818,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 
     fclose(indexFile);
  
-    if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) {
-        // create a sub-index with only seeds that occur in alt contigs
-        snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
-        bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
-            hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index);
-        if (!ok) {
-            WriteErrorMessage("Failed to build lifted index %s\n", filenameBuffer);
-            soft_exit(1);
-            return false;
-        }
-    }
-
     index->genome = NULL; // deleted earlier
     delete index;
     if (computeBias && biasTable != NULL) {
@@ -1388,22 +1389,23 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa
         CHECK_ALTS_AND_ADD_LIFTED
     }
 }
-    
+#undef CHECK_ALTS_AND_ADD_LIFTED
+
     void
-dualSort32(
+dualBackwardsSort32(
     _int64 n,
     unsigned* keys,
     unsigned* values)
 {
     // todo: optimize sorting, just using a simple selection sort for now
     unsigned t;
-#define DUAL_SORT \
+#define DUAL_BACKWARDS_SORT \
     if (n < 2) { \
         return; \
     } \
     for (_int64 i = 0; i < n - 1; i++) { \
         for (_int64 j = n - 1; j > i; j--) { \
-            if (keys[i] > keys[j]) { \
+            if (keys[i] < keys[j]) { \
                 t = keys[i]; \
                 keys[i] = keys[j]; \
                 keys[j] = t; \
@@ -1413,18 +1415,19 @@ dualSort32(
             } \
         } \
     }
-    DUAL_SORT
+    DUAL_BACKWARDS_SORT
 }
 
     void
-dualSort(
+dualBackwardsSort(
     _int64 n,
     GenomeLocation* keys,
     GenomeLocation* values)
 {
     GenomeLocation t;
-    DUAL_SORT
+    DUAL_BACKWARDS_SORT
 }
+#undef DUAL_BACKWARDS_SORT
 
     void
 GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTableBatch *batches, BuildHashTablesThreadContext *context, IndexBuildStats *stats, bool large)
@@ -1443,9 +1446,26 @@ GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashT
         if (nLiftedHits > 1 || nLiftedRCHits > 1) {
             context->unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
             _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits);
-            if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) {
-                dualSort(nHits, (GenomeLocation*)liftedHits, (GenomeLocation*)hits);
-                dualSort(nRCHits, (GenomeLocation*)liftedRCHits, (GenomeLocation*)rcHits);
+            if ((nHits > 1 && genomeLocation == hits[0]) || (nRCHits > 1 && genomeLocation == rcHits[0])) {
+                // re-lift unlifted so that the order corresponds, then sort both by lifted location
+                for (int i = 0; i < nHits; i++) {
+                    ((GenomeLocation*)liftedHits)[i] = genome->getLiftedLocation(hits[i]);
+                }
+                for (int i = 0; i < nRCHits; i++) {
+                    ((GenomeLocation*)liftedRCHits)[i] = genome->getLiftedLocation(rcHits[i]);
+                }
+                dualBackwardsSort(nHits, (GenomeLocation*)liftedHits, (GenomeLocation*)hits);
+                dualBackwardsSort(nRCHits, (GenomeLocation*)liftedRCHits, (GenomeLocation*)rcHits);
+#ifdef _DEBUG
+                for (int i = 0; i < nHits; i++) {
+                    _ASSERT(genome->getLiftedLocation(hits[i]) == liftedHits[i]);
+                    _ASSERT(i == 0 || liftedHits[i - 1] >= liftedHits[i]);
+                }
+                for (int i = 0; i < nRCHits; i++) {
+                    _ASSERT(genome->getLiftedLocation(rcHits[i]) == liftedRCHits[i]);
+                    _ASSERT(i == 0 || liftedRCHits[i - 1] >= liftedRCHits[i]);
+                }
+#endif
             }
         }
     } else {
@@ -1456,8 +1476,25 @@ GenomeIndex::resortLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashT
             context->unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
             _ASSERT(nLiftedHits == nHits && nLiftedRCHits == nRCHits);
             if ((nHits > 0 && genomeLocation == hits[0]) || (nRCHits > 0 && genomeLocation == rcHits[0])) {
-                dualSort32(nHits, (unsigned*)liftedHits, (unsigned*)hits);
-                dualSort32(nRCHits, (unsigned*)liftedRCHits, (unsigned*)rcHits);
+                // re-lift unlifted so that the order corresponds, then sort both by lifted location
+                for (int i = 0; i < nHits; i++) {
+                    ((unsigned*)liftedHits)[i] = GenomeLocationAsInt32(genome->getLiftedLocation(hits[i]));
+                }
+                for (int i = 0; i < nRCHits; i++) {
+                    ((unsigned*)liftedRCHits)[i] = GenomeLocationAsInt32(genome->getLiftedLocation(rcHits[i]));
+                }
+                dualBackwardsSort32(nHits, (unsigned*)liftedHits, (unsigned*)hits);
+                dualBackwardsSort32(nRCHits, (unsigned*)liftedRCHits, (unsigned*)rcHits);
+#ifdef _DEBUG
+                for (int i = 0; i < nHits; i++) {
+                    _ASSERT(genome->getLiftedLocation(hits[i]) == liftedHits[i]);
+                    _ASSERT(i == 0 || liftedHits[i - 1] >= liftedHits[i]);
+                }
+                for (int i = 0; i < nRCHits; i++) {
+                    _ASSERT(genome->getLiftedLocation(rcHits[i]) == liftedRCHits[i]);
+                    _ASSERT(i == 0 || liftedRCHits[i - 1] >= liftedRCHits[i]);
+                }
+#endif
             }
         }
     }
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
index c1cc00ec..92f73883 100644
--- a/SNAPLib/IntersectingPairedEndAligner.cpp
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -186,7 +186,7 @@ IntersectingPairedEndAligner::align(
 
 #ifdef  _DEBUG
     if (_DumpAlignments) {
-        printf("\nIntersectingAligner aligning reads '%*.s' and '%.*s' with data '%.*s' and '%.*s'\n", read0->getIdLength(), read0->getId(), read1->getIdLength(), read1->getId(), read0->getDataLength(), read0->getData(), read1->getDataLength(), read1->getData());
+        printf("\nIntersectingAligner aligning reads '%.*s' and '%.*s' with data '%.*s' and '%.*s'\n", read0->getIdLength(), read0->getId(), read1->getIdLength(), read1->getId(), read0->getDataLength(), read0->getData(), read1->getDataLength(), read1->getData());
     }
 #endif  // _DEBUG
 
@@ -434,8 +434,8 @@ IntersectingPairedEndAligner::align(
         GenomeLocation      lastGenomeLocationForReadWithMoreHits;
         GenomeLocation      lastUnliftedGenomeLocationForReadWithFewerHits;
         GenomeLocation      lastUnliftedGenomeLocationForReadWithMoreHits;
-        GenomeLocation     *pLastGenomeLocationForReadWithFewerHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithFewerHits : NULL;
-        GenomeLocation     *pLastGenomeLocationForReadWithMoreHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithMoreHits : NULL;
+        GenomeLocation     *pLastUnliftedGenomeLocationForReadWithFewerHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithFewerHits : NULL;
+        GenomeLocation     *pLastUnliftedGenomeLocationForReadWithMoreHits = doesGenomeIndexHaveAlts ? &lastUnliftedGenomeLocationForReadWithMoreHits : NULL;
         unsigned            lastSeedOffsetForReadWithMoreHits;
 
         bool                outOfMoreHitsLocations = false;
@@ -443,13 +443,13 @@ IntersectingPairedEndAligner::align(
         //
         // Seed the intersection state by doing a first lookup.
         //
-        if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) {
+        if (setPair[readWithFewerHits]->getFirstHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) {
             //
             // No hits in this direction.
             //
             continue;   // The outer loop over set pairs.
         }
-
+        _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits);
         lastGenomeLocationForReadWithMoreHits = InvalidGenomeLocation;
 
         //
@@ -471,9 +471,10 @@ IntersectingPairedEndAligner::align(
                 // location that's not too high.
                 //
                 if (!setPair[readWithMoreHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithFewerHits + maxSpacing,
-                    &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) {
+                    &lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) {
                     break;  // End of all of the mates.  We're done with this set pair.
                 }
+                _ASSERT(pLastUnliftedGenomeLocationForReadWithMoreHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithMoreHits) == lastGenomeLocationForReadWithMoreHits);
             }
 
             if ((lastGenomeLocationForReadWithMoreHits + maxSpacing < lastGenomeLocationForReadWithFewerHits || outOfMoreHitsLocations) &&
@@ -490,12 +491,13 @@ IntersectingPairedEndAligner::align(
                 }
 
                 if (!setPair[readWithFewerHits]->getNextHitLessThanOrEqualTo(lastGenomeLocationForReadWithMoreHits + maxSpacing, &lastGenomeLocationForReadWithFewerHits,
-                    &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) {
+                    &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) {
                     //
                     // No more candidates on the read with fewer hits side.  We're done with this set pair.
                     //
                     break;
                 }
+                _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits);
                 continue;
             }
 
@@ -522,8 +524,8 @@ IntersectingPairedEndAligner::align(
 
 #ifdef _DEBUG
                 if (_DumpAlignments) {
-                    printf("SetPair %d, added more hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n",
-                            whichSetPair, lowestFreeScoringMateCandidate[whichSetPair], lastGenomeLocationForReadWithMoreHits,
+                    printf("SetPair %d, added more hits candidate %d at genome location %u(%u), bestPossibleScore %d, seedOffset %d\n",
+                            whichSetPair, lowestFreeScoringMateCandidate[whichSetPair], lastGenomeLocationForReadWithMoreHits, lastUnliftedGenomeLocationForReadWithMoreHits,
                             bestPossibleScoreForReadWithMoreHits,
                             lastSeedOffsetForReadWithMoreHits);
                 }
@@ -533,11 +535,12 @@ IntersectingPairedEndAligner::align(
 
                 previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits;
 
-                if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastGenomeLocationForReadWithMoreHits)) {
+                if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) {
                     lastGenomeLocationForReadWithMoreHits = 0;
                     outOfMoreHitsLocations = true;
                     break; // out of the loop looking for candidates on the more hits side.
                 }
+                _ASSERT(pLastUnliftedGenomeLocationForReadWithMoreHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithMoreHits) == lastGenomeLocationForReadWithMoreHits);
             }
 
             //
@@ -586,8 +589,8 @@ IntersectingPairedEndAligner::align(
 
 #ifdef _DEBUG
                 if (_DumpAlignments) {
-                    printf("SetPair %d, added fewer hits candidate %d at genome location %u, bestPossibleScore %d, seedOffset %d\n",
-                            whichSetPair, lowestFreeScoringCandidatePoolEntry, lastGenomeLocationForReadWithFewerHits,
+                    printf("SetPair %d, added fewer hits candidate %d at genome location %u(%u), bestPossibleScore %d, seedOffset %d\n",
+                            whichSetPair, lowestFreeScoringCandidatePoolEntry, lastGenomeLocationForReadWithFewerHits, lastUnliftedGenomeLocationForReadWithFewerHits,
                             lowestBestPossibleScoreOfAnyPossibleMate + bestPossibleScoreForReadWithFewerHits,
                             lastSeedOffsetForReadWithFewerHits);
                 }
@@ -597,9 +600,10 @@ IntersectingPairedEndAligner::align(
                 maxUsedBestPossibleScoreList = max(maxUsedBestPossibleScoreList, bestPossibleScore);
             }
 
-            if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastGenomeLocationForReadWithFewerHits)) {
+            if (!setPair[readWithFewerHits]->getNextLowerHit(&lastGenomeLocationForReadWithFewerHits, &lastSeedOffsetForReadWithFewerHits, pLastUnliftedGenomeLocationForReadWithFewerHits)) {
                 break;
             }
+            _ASSERT(pLastUnliftedGenomeLocationForReadWithFewerHits == NULL || genome->getLiftedLocation(*pLastUnliftedGenomeLocationForReadWithFewerHits) == lastGenomeLocationForReadWithFewerHits);
         }
     } // For each set pair
 
@@ -638,8 +642,8 @@ IntersectingPairedEndAligner::align(
 
 #ifdef _DEBUG
         if (_DumpAlignments) {
-            printf("Scored fewer end candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n", (int)(candidate - scoringCandidatePool),
-                candidate->whichSetPair, readWithFewerHits, candidate->readWithFewerHitsGenomeLocation, candidate->seedOffset,
+            printf("Scored fewer end candidate %d, set pair %d, read %d, location %u(%u), seed offset %d, score limit %d, score %d, offset %d\n", (int)(candidate - scoringCandidatePool),
+                candidate->whichSetPair, readWithFewerHits, candidate->readWithFewerHitsGenomeLocation, candidate->readWithFewerHitsUnliftedGenomeLocation, candidate->seedOffset,
                 scoreLimit, fewerEndScore, fewerEndGenomeLocationOffset);
         }
 #endif // DEBUG
@@ -669,8 +673,8 @@ IntersectingPairedEndAligner::align(
                             &mate->genomeOffset);
 #ifdef _DEBUG
                         if (_DumpAlignments) {
-                            printf("Scored mate candidate %d, set pair %d, read %d, location %u, seed offset %d, score limit %d, score %d, offset %d\n",
-                                (int)(mate - scoringMateCandidates[candidate->whichSetPair]), candidate->whichSetPair, readWithMoreHits, mate->readWithMoreHitsGenomeLocation,
+                            printf("Scored mate candidate %d, set pair %d, read %d, location %u(%u), seed offset %d, score limit %d, score %d, offset %d\n",
+                                (int)(mate - scoringMateCandidates[candidate->whichSetPair]), candidate->whichSetPair, readWithMoreHits, mate->readWithMoreHitsGenomeLocation, mate->readWithMoreHitsUnliftedGenomeLocation,
                                 mate->seedOffset, scoreLimit - fewerEndScore, mate->score, mate->genomeOffset);
                         }
 #endif // _DEBUG
@@ -862,9 +866,10 @@ IntersectingPairedEndAligner::align(
                             }
     #ifdef  _DEBUG
                             if (_DumpAlignments) {
-                                printf("Added %e (= %e * %e) @ (%u, %u), giving new probability of all pairs %e, score %d = %d + %d%s\n",
+                                printf("Added %e (= %e * %e) @ (%u, %u)((%u, %u)), giving new probability of all pairs %e, score %d = %d + %d%s\n",
                                     pairProbability, mate->matchProbability , fewerEndMatchProbability,
                                     candidate->readWithFewerHitsGenomeLocation + fewerEndGenomeLocationOffset, mate->readWithMoreHitsGenomeLocation + mate->genomeOffset,
+                                    candidate->readWithFewerHitsUnliftedGenomeLocation + fewerEndGenomeLocationOffset, mate->readWithMoreHitsUnliftedGenomeLocation+ mate->genomeOffset,
                                     probabilityOfAllPairs,
                                     pairScore, fewerEndScore, mate->score, isBestHit ? " New best hit" : "");
                             }
@@ -1476,6 +1481,11 @@ IntersectingPairedEndAligner::MergeAnchor::checkMerge(GenomeLocation newMoreHitL
         matchProbability = newMatchProbability;
         pairScore = newPairScore;
         *oldMatchProbability = 0.0;
+#ifdef _DEBUG
+        if (_DumpAlignments) {
+            printf("New anchor loc (%u, %u)\n", newMoreHitLocation, newFewerHitLocation);
+        }
+#endif
         return false;
     }  else {
         //
diff --git a/tests/alttestgen.py b/tests/alttestgen.py
index 20640293..082e6830 100644
--- a/tests/alttestgen.py
+++ b/tests/alttestgen.py
@@ -68,20 +68,20 @@ def __init__(self, contigs={}):
     def add(self, contig):
         self.contigs[contig.name] = contig
 
-    def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.05):
+    def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.01):
         pc = self.contigs[parent]
         altseq = random_mutate(pc.seq[start:stop], pmut)
         if (isRC):
             altseq = rc(altseq)
         self.add(Contig(name, accession, altseq, True, parent, start, isRC))
 
-    def make_read(self, chr, pos, isReverse=False, len=100, pmut=.02, id=None):
+    def make_read(self, chr, pos, isReverse=False, len=100, pmut=.01, id=None):
         if id == None:
             id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isReverse else 'f'))
         seq = random_mutate(self.contigs[chr].seq[pos:pos + len], pmut)
         return Read(id, chr, pos, seq)
 
-    def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02):
+    def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.01):
         id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1 + 1, chr2, pos2 + 1)
         r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1")
         r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2")
@@ -100,7 +100,10 @@ def to_sam_pair(self, read1, read2):
     def write_fasta(self, filename):
         with open(filename, 'w') as file:
             for write_alts in [False, True]:
-                for contig in self.contigs.values():
+                cnames = self.contigs.keys()
+                cnames.sort()
+                for cname in cnames:
+                    contig = self.contigs[cname]
                     if contig.isAlt == write_alts:
                         file.write(">{}|gb|{}\n".format(contig.name, contig.accession))
                         LINE_LEN=100
@@ -117,18 +120,29 @@ def write_alts(self, filename):
                         1, len(contig.seq), 1 + contig.parentLoc, contig.parentLoc + len(contig.seq), 0, 0))
 
 g = Genome()
-g.add(Contig("chr1", "C01", random_bases(5000)))
+seq = random_bases(7000)
+seq = seq + random_mutate(seq[5000:6000]) + random_bases(1000) + random_mutate(seq[5000:6000]) + random_bases(1000)
+g.add(Contig("chr1", "C01", seq))
 g.add_alt("chr1a", "C01A", "chr1", 1000, 2000)
 g.add_alt("chr1b", "C01B", "chr1", 3000, 4000, True)
+g.add_alt("chr1c", "C01C", "chr1", 5000, 6000)
+g.add_alt("chr1d", "C01D", "chr1", 7000, 8000, True)
+g.add_alt("chr1e", "C01E", "chr1", 9000, 10000)
+g.add_alt("chr1f", "C01F", "chr1", 9000, 10000)
 g.write_fasta("test.fa")
 g.write_alts("test_alts.txt")
 
 with open("test.sam", "w") as file:
-    for i in [100, 150, 200, 250, 2100, 2150, 2200, 2250]:
-        if i < 2000:
-            [r1, r2] = g.make_pair('chr1', i, 'chr1a' , i)
-        else:
-            [r1, r2] = g.make_pair('chr1', i, 'chr1b' , 900 - (i - 2000))
-        file.write(g.to_sam_pair(r1,r2))
-        [r1, r2] = g.make_pair('chr1', i, 'chr1', i+1000)
-        file.write(g.to_sam_pair(r1,r2))
+    for i in [0, 100, 600, 800, 900]:
+        for j in range(5):
+            start = j * 2000
+            chralt = ['chr1a', 'chr1b', 'chr1c', 'chr1d', 'chr1e'][j]
+            ialt = 900 - i if g.contigs[chralt].isAltRC else i
+            [r1, r2] = g.make_pair('chr1', i + start, chralt, ialt)
+            file.write(g.to_sam_pair(r1,r2))
+            [r1, r2] = g.make_pair('chr1', i + start, 'chr1', i + start + 1000)
+            file.write(g.to_sam_pair(r1,r2))
+            [r1, r2] = g.make_pair(chralt, ialt, 'chr1', i + start + 2000)
+            file.write(g.to_sam_pair(r1,r2))
+            [r1, r2] = g.make_pair('chr1', i + start + 1000, 'chr1', i + start + 2000)
+            file.write(g.to_sam_pair(r1,r2))

From 6a4bfd8ff07b6202d66daf0cd7b38ab7773c096d Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Mon, 11 Jan 2016 13:21:34 -0800
Subject: [PATCH 14/19] Compile on Linux

---
 SNAPLib/GenomeIndex.cpp | 34 +++++++++++++++++++++++-----------
 tests/alttestgen.py     |  8 +++-----
 2 files changed, 26 insertions(+), 16 deletions(-)

diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index a257c3c0..08825b83 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -540,6 +540,19 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 		WriteStatusMessage("%llds\n", (timeInMillis() - spillDone + 500) / 1000);
 	}
 
+    // declare variables before goto
+    _uint64 nBackpointersProcessed;
+    _int64 lastPrintTime;
+    const unsigned maxHistogramEntry = 500000;
+    _uint64 countOfTooBigForHistogram;
+    _uint64  sumOfTooBigForHistogram;
+    _uint64 largestSeed;
+    unsigned *histogram;
+    FILE *tablesFile;
+    size_t totalBytesWritten;
+    _uint64 overflowTableIndex;
+    _uint64 duplicateSeedsProcessed;
+
     if (unliftedIndex != NULL && liftedIndexPass == 1) {
         goto lifted_skip_overflow;
     }
@@ -571,14 +584,13 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 		soft_exit(1);
 	}
 
-    _uint64 nBackpointersProcessed = 0;
-    _int64 lastPrintTime = timeInMillis();
+    nBackpointersProcessed = 0;
+    lastPrintTime = timeInMillis();
 
-    const unsigned maxHistogramEntry = 500000;
-    _uint64 countOfTooBigForHistogram = 0;
-    _uint64  sumOfTooBigForHistogram = 0;
-    _uint64 largestSeed = 0;
-    unsigned *histogram = NULL;
+    countOfTooBigForHistogram = 0;
+    sumOfTooBigForHistogram = 0;
+    largestSeed = 0;
+    histogram = NULL;
     if (buildHistogram) {
         histogram = new unsigned[maxHistogramEntry+1];
         for (unsigned i = 0; i <= maxHistogramEntry; i++) {
@@ -591,15 +603,15 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 	// Write the hash tables as we go so that we can free their memory on the fly.
 	//
 	snprintf(filenameBuffer,filenameBufferSize,"%s%c%s", directoryName, PATH_SEP, GenomeIndexHashFileName);
-    FILE *tablesFile = fopen(filenameBuffer, "wb");
+    tablesFile = fopen(filenameBuffer, "wb");
     if (NULL == tablesFile) {
         WriteErrorMessage("Unable to open hash table file '%s'\n", filenameBuffer);
         soft_exit(1);
     }
 
-    size_t totalBytesWritten = 0;
-    _uint64 overflowTableIndex = 0;
-	_uint64 duplicateSeedsProcessed = 0;
+    totalBytesWritten = 0;
+    overflowTableIndex = 0;
+    duplicateSeedsProcessed = 0;
 
 	for (unsigned whichHashTable = 0; whichHashTable < nHashTables; whichHashTable++) {
 		if (NULL == hashTables[whichHashTable]) {
diff --git a/tests/alttestgen.py b/tests/alttestgen.py
index 082e6830..f92bc700 100644
--- a/tests/alttestgen.py
+++ b/tests/alttestgen.py
@@ -11,8 +11,6 @@
 import subprocess
 import random
 
-import pandas as pd
-
 BASES = "ACTG"
 RCBASES = {"A":"T", "T":"A", "C":"G", "G":"C"}
 
@@ -68,20 +66,20 @@ def __init__(self, contigs={}):
     def add(self, contig):
         self.contigs[contig.name] = contig
 
-    def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.01):
+    def add_alt(self, name, accession, parent, start, stop, isRC=False, pmut=0.02):
         pc = self.contigs[parent]
         altseq = random_mutate(pc.seq[start:stop], pmut)
         if (isRC):
             altseq = rc(altseq)
         self.add(Contig(name, accession, altseq, True, parent, start, isRC))
 
-    def make_read(self, chr, pos, isReverse=False, len=100, pmut=.01, id=None):
+    def make_read(self, chr, pos, isReverse=False, len=100, pmut=.02, id=None):
         if id == None:
             id = "r{:05d}_{}_{}_{}".format(random.randint(0,99999), chr, pos+1, ('r' if isReverse else 'f'))
         seq = random_mutate(self.contigs[chr].seq[pos:pos + len], pmut)
         return Read(id, chr, pos, seq)
 
-    def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.01):
+    def make_pair(self, chr1, pos1, chr2, pos2, len=100, pmut=.02):
         id = "r{:05d}_{}_{}_{}_{}".format(random.randint(0,99999), chr1, pos1 + 1, chr2, pos2 + 1)
         r1 = self.make_read(chr1, pos1, False, len, pmut, id + "/1")
         r2 = self.make_read(chr2, pos2, True, len, pmut, id + "/2")

From e0df34d43302c55f3f5a944647859cac0754ea8c Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Tue, 12 Jan 2016 10:52:08 -0800
Subject: [PATCH 15/19] Calculate bias table for lifted index

---
 SNAPLib/GenomeIndex.cpp | 108 ++++++++++++++++++++++++++++++----------
 SNAPLib/GenomeIndex.h   |  13 +++--
 2 files changed, 89 insertions(+), 32 deletions(-)

diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 08825b83..43b4563f 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -337,6 +337,9 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 
     // Compute bias table sizes, unless we're using the precomputed ones hardcoded in BiasTables.cpp
     double *biasTable = NULL;
+    if (unliftedIndex != NULL) {
+        computeBias = true;
+    }
     if (!computeBias) {
         if (large) {
             biasTable = hg19_biasTables_large[hashTableKeySize][seedLen];
@@ -353,7 +356,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     if (computeBias) {
         unsigned nHashTables = 1 << ((max((unsigned)seedLen, hashTableKeySize * 4) - hashTableKeySize * 4) * 2);
         biasTable = new double[nHashTables];
-        ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large);
+        ComputeBiasTable(genome, seedLen, biasTable, maxThreads, forceExact, hashTableKeySize, large, unliftedIndex);
     }
 
     WriteStatusMessage("Allocating memory for hash tables...");
@@ -980,7 +983,7 @@ GenomeIndex::~GenomeIndex()
 }
 
     void
-GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large)
+GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large, const GenomeIndex* unliftedIndex)
 /**
  * Fill in table with the table size biases for a given genome and seed size.
  * We assume that table is already of the correct size for our seed size
@@ -1053,12 +1056,14 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table,
 
 			_ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
 
-
-			if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) {
-				_uint64 value = 42;
-				seedsSeen->Insert(seed.getBases(), &value);
-				numExactSeeds[seed.getHighBases(hashTableKeySize)]++;
-			}
+            _int64 nHits, nRCHits;
+            if (unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex, &nHits, &nRCHits)) {
+                if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) {
+                    _uint64 value = 42;
+                    seedsSeen->Insert(seed.getBases(), &value);
+                    numExactSeeds[seed.getHighBases(hashTableKeySize)]++;
+                }
+            }
         }
 
 //      for (unsigned i = 0; i < nHashTables; i++) printf("Hash table %d is predicted to have %lld entries\n", i, numExactSeeds[i]);
@@ -1102,6 +1107,8 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table,
             contexts[i].validSeeds = &validSeeds;
             contexts[i].approximateCounterLocks = locks;
 			contexts[i].large = large;
+            contexts[i].unliftedIndex = unliftedIndex;
+
 
             StartNewThread(ComputeBiasTableWorkerThreadMain, &contexts[i]);
         }
@@ -1206,19 +1213,30 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param)
 
 			_ASSERT(whichHashTable < context->nHashTables);
 
-			if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) {
-				PerCounterBatch *batch = &batches[whichHashTable];
-				AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
-				batch->apply(&(*context->approxCounters)[whichHashTable]);    
-				ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
+            _int64 nRepeats = 1;
+            if (context->unliftedIndex != NULL) {
+                _int64 nHits, nRCHits;
+                if (hasAnyAltHits(seed, i, context->unliftedIndex, &nHits, &nRCHits)) {
+                    nRepeats = nHits + nRCHits;
+                } else {
+                    nRepeats = 0;
+                }
+            }
+            for (; nRepeats > 0; nRepeats--) {
+                if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) {
+                    PerCounterBatch *batch = &batches[whichHashTable];
+                    AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
+                    batch->apply(&(*context->approxCounters)[whichHashTable]);
+                    ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
 
-				_int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds);
+                    _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds);
 
-				if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds)/printBatchSize) {
-					WriteStatusMessage("Bias computation: %lld / %lld\n",(basesProcessed/printBatchSize)*printBatchSize, (_int64)countOfBases);
-				}
-				unrecordedSkippedSeeds= 0;  // We've now recorded them.
-			}
+                    if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds) / printBatchSize) {
+                        WriteStatusMessage("Bias computation: %lld / %lld\n", (basesProcessed / printBatchSize)*printBatchSize, (_int64)countOfBases);
+                    }
+                    unrecordedSkippedSeeds = 0;  // We've now recorded them.
+                }
+            }
     }
 
     for (unsigned i = 0; i < context->nHashTables; i++) {
@@ -1246,7 +1264,43 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param)
     }
 }
 
-
+// Returns true only when genomeLocation equals the first listed hit for seed in unliftedIndex (forward or RC, whichever
+// first entry is lower) and genome->isAltLocation() holds for some hit or RC hit; *pnHits/*pnRCHits get the hit counts.
+bool
+GenomeIndex::hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits)
+{
+    if (unliftedIndex == NULL) {
+        return false;
+    }
+    _int64 nHits = 0, nRCHits = 0;  // filled by the lookup below, then published to the caller by the macro
+#define HAS_ANY_ALTS \
+        *pnHits = nHits; *pnRCHits = nRCHits; \
+        if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \
+            (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \
+            for (int i = 0; i < nHits; i++) { \
+                if (unliftedIndex->genome->isAltLocation(hits[i])) { \
+                    return true; \
+                } \
+            } \
+            for (int i = 0; i < nRCHits; i++) { \
+                if (unliftedIndex->genome->isAltLocation(rcHits[i])) { \
+                    return true; \
+                } \
+            } \
+        }
+    if (unliftedIndex->doesGenomeIndexHave64BitLocations()) {
+        const GenomeLocation *hits, *rcHits;
+        GenomeLocation singleHit[2], singleRCHit[2];
+        unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
+        HAS_ANY_ALTS
+    } else {
+        const unsigned *hits, *rcHits;  // 32-bit index: same check, expanded over unsigned locations
+        unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
+        HAS_ANY_ALTS
+    }
+    return false;   // not the first listed location for this seed, or no hit satisfied isAltLocation()
+}
+#undef HAS_ANY_ALTS
 
     void 
 GenomeIndex::BuildHashTablesWorkerThreadMain(void *param)
@@ -1375,11 +1429,11 @@ GenomeIndex::indexLiftedSeed(GenomeLocation genomeLocation, Seed seed, PerHashTa
         if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \
                 (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \
             bool anyAlts = false; \
-            for (int i = 0; i < nHits && ! anyAlts; i++) { \
-                anyAlts = genome->getLiftedLocation(hits[i]) != hits[i]; \
+            for (int i = 0; i < nHits && !anyAlts; i++) { \
+                anyAlts = genome->isAltLocation(hits[i]); \
             } \
             for (int i = 0; i < nRCHits && !anyAlts; i++) { \
-                anyAlts = genome->getLiftedLocation(rcHits[i]) != rcHits[i]; \
+                anyAlts = genome->isAltLocation(rcHits[i]); \
             } \
             if (anyAlts) { \
                 for (int i = 0; i < nHits; i++) { \
@@ -2128,7 +2182,7 @@ GenomeIndex::lookupSeed32(
     _int64           *nHits,
     const unsigned  **hits,
     _int64           *nRCHits,
-    const unsigned  **rcHits)
+    const unsigned  **rcHits) const
 {
     _ASSERT(locationSize == 4);   // This is the caller's responsibility to check.
 
@@ -2216,7 +2270,7 @@ GenomeIndex::lookupSeedAlt32(
 GenomeIndex::fillInLookedUpResults32(
     const unsigned  *subEntry,
     _int64          *nHits, 
-    const unsigned **hits)
+    const unsigned **hits) const
 {
     //
     // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at 
@@ -2265,7 +2319,7 @@ GenomeIndex::lookupSeed(
     _int64 *                nRCHits, 
     const GenomeLocation ** rcHits, 
     GenomeLocation *        singleHit, 
-    GenomeLocation *        singleRCHit)
+    GenomeLocation *        singleRCHit) const
 {
     _ASSERT(locationSize > 4 && locationSize <= 8);
 
@@ -2367,7 +2421,7 @@ GenomeIndex::lookupSeedAlt(
 }
 
     void 
-GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation)
+GenomeIndex::fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) const
 {
      //
     // WARNING: the code in the IntersectingPairedEndAligner relies on being able to look at 
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
index c0a45840..c0897a79 100644
--- a/SNAPLib/GenomeIndex.h
+++ b/SNAPLib/GenomeIndex.h
@@ -47,8 +47,8 @@ class GenomeIndex {
     // be pointed to as a return value.  When only a single hit is returned, *hits == singleHit, so there's
     // no need to check on the caller's side.
     //
-    void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit);
-    void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits);
+    void lookupSeed(Seed seed, _int64 *nHits, const GenomeLocation **hits, _int64 *nRCHits, const GenomeLocation **rcHits, GenomeLocation *singleHit, GenomeLocation *singleRCHit) const;
+    void lookupSeed32(Seed seed, _int64 *nHits, const unsigned **hits, _int64 *nRCHits, const unsigned **rcHits) const;
 
     // versions for genome that has alt regions
     // hits/rcHits locations are lifted to non-alt contigs, unliftedHits/unliftedRCHits are original locations in alt contigs
@@ -185,7 +185,7 @@ class GenomeIndex {
     static double *hg19_biasTables[largestKeySize+1][largestBiasTable+1];
     static double *hg19_biasTables_large[largestKeySize+1][largestBiasTable+1];
 
-    static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large);
+    static void ComputeBiasTable(const Genome* genome, int seedSize, double* table, unsigned maxThreads, bool forceExact, unsigned hashTableKeySize, bool large, const GenomeIndex* unliftedIndex = NULL);
 
     struct ComputeBiasTableThreadContext {
         SingleWaiterObject              *doneObject;
@@ -200,12 +200,15 @@ class GenomeIndex {
         unsigned                         seedLen;
         volatile _int64                 *validSeeds;
 		bool							 large;
+        const GenomeIndex               *unliftedIndex;
 
         ExclusiveLock                   *approximateCounterLocks;
     };
 
     static void ComputeBiasTableWorkerThreadMain(void *param);
 
+    static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits);
+
     struct OverflowBackpointer;
 
     struct BuildHashTablesThreadContext {
@@ -316,6 +319,6 @@ class GenomeIndex {
 						BuildHashTablesThreadContext*context,
                         GenomeLocation               genomeLocation);
 
-    void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits);
-    void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation);
+    void fillInLookedUpResults32(const unsigned *subEntry, _int64 *nHits, const unsigned **hits) const;
+    void fillInLookedUpResults(GenomeLocation lookedUpLocation, _int64 *nHits, const GenomeLocation **hits, GenomeLocation *singleHitLocation) const;
 };

From ea14df263644f305530272cf36de0eb5f0ca888c Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Tue, 12 Jan 2016 14:49:04 -0800
Subject: [PATCH 16/19] Cleanup status printing for alt index build

---
 SNAPLib/GenomeIndex.cpp | 43 ++++++++++++++++++++++-------------------
 1 file changed, 23 insertions(+), 20 deletions(-)

diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 43b4563f..637bf9b9 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -382,14 +382,6 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     start = timeInMillis();
     volatile _int64 nextOverflowBackpointer = 0;
 
-    volatile _int64 nonSeeds = 0;
-    volatile _int64 seedsWithMultipleOccurrences = 0;
-    volatile _int64 genomeLocationsInOverflowTable = 0;     // Number of extra hits on duplicate indices.  This should come out once we implement the overflow table.
-    volatile _int64 bothComplementsUsed = 0;    // Number of hash buckets where both complements are used
-    volatile _int64 noBaseAvailable = 0;        // Number of places where getSubstring returned null.
-    volatile _int64 nBasesProcessed = 0;
-    volatile int runningThreadCount;
-
     unsigned nThreads = __min(GetNumberOfProcessors(), maxThreads);
     BuildHashTablesThreadContext *threadContexts = new BuildHashTablesThreadContext[nThreads];
 
@@ -403,6 +395,14 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 
 lifted_index_pass_start:
 
+    volatile _int64 nonSeeds = 0;
+    volatile _int64 seedsWithMultipleOccurrences = 0;
+    volatile _int64 genomeLocationsInOverflowTable = 0;     // Number of extra hits on duplicate indices.  This should come out once we implement the overflow table.
+    volatile _int64 bothComplementsUsed = 0;    // Number of hash buckets where both complements are used
+    volatile _int64 noBaseAvailable = 0;        // Number of places where getSubstring returned null.
+    volatile _int64 nBasesProcessed = 0;
+    volatile int runningThreadCount;
+
     SingleWaiterObject doneObject;
     CreateSingleWaiterObject(&doneObject);
 
@@ -489,18 +489,19 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
 //                (_int64)hashTables[j]->GetUsedElementCount() * 100 / (_int64)hashTables[j]->GetTableSize());
     }
 
-    WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n",
-        seedsWithMultipleOccurrences,
-        (seedsWithMultipleOccurrences * 100) / countOfBases,
-        genomeLocationsInOverflowTable,
-        genomeLocationsInOverflowTable * 100 / countOfBases,
-        nonSeeds,
-        (nonSeeds * 100) / countOfBases,
-        bothComplementsUsed,
-        noBaseAvailable);
-
-    WriteStatusMessage("Hash table build took %llds\n",(timeInMillis() + 500 - start) / 1000);
-
+    if (unliftedIndex == NULL) {
+        WriteStatusMessage("%lld(%lld%%) seeds occur more than once, total of %lld(%lld%%) genome locations are not unique, %lld(%lld%%) bad seeds, %lld both complements used %lld no string\n",
+            seedsWithMultipleOccurrences,
+            (seedsWithMultipleOccurrences * 100) / countOfBases,
+            genomeLocationsInOverflowTable,
+            genomeLocationsInOverflowTable * 100 / countOfBases,
+            nonSeeds,
+            (nonSeeds * 100) / countOfBases,
+            bothComplementsUsed,
+            noBaseAvailable);
+
+        WriteStatusMessage("Hash table build took %llds\n", (timeInMillis() + 500 - start) / 1000);
+    }
     //
     // We're done with the raw genome.  Delete it to save some memory.
     //
@@ -765,6 +766,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
     if (genome != NULL && genome->hasAltContigs() && unliftedIndex == NULL) {
         // create a sub-index with only seeds that occur in alt contigs
         // need to build lifted index here because it will reorder unlifted index overflow table
+        WriteStatusMessage("Creating sub-index for alt contigs...\n");
         snprintf(filenameBuffer, filenameBufferSize, "%s%c%s", directoryName, PATH_SEP, LiftedIndexDirName);
         bool ok = BuildIndexToDirectory(genome, seedLen, slack, true, filenameBuffer, maxThreads, chromosomePaddingSize, forceExact,
             hashTableKeySize, large, histogramFileName, locationSize, smallMemory, index);
@@ -773,6 +775,7 @@ GenomeIndex::BuildIndexToDirectory(const Genome *genome, int seedLen, double sla
             soft_exit(1);
             return false;
         }
+        WriteStatusMessage("...Finished creating sub-index for alt contigs\n");
     }
 
     //

From 8d12f71421b2630d2b47b1e88ed4ce06becefbc7 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Tue, 12 Jan 2016 15:16:49 -0800
Subject: [PATCH 17/19] Pre-alloc alt match info

---
 SNAPLib/BaseAligner.cpp      | 25 ++++++++++-----------
 SNAPLib/BaseAligner.h        |  2 +-
 SNAPLib/VariableSizeVector.h | 42 +++++++++++++++++++++++++-----------
 3 files changed, 44 insertions(+), 25 deletions(-)

diff --git a/SNAPLib/BaseAligner.cpp b/SNAPLib/BaseAligner.cpp
index cd6b72eb..febad5f6 100644
--- a/SNAPLib/BaseAligner.cpp
+++ b/SNAPLib/BaseAligner.cpp
@@ -227,8 +227,8 @@ Routine Description:
     hashTableEpoch = 0;
 
     if (genome->hasAltContigs()) {
-        // todo: BigAlloc / new(allocator) -> fixed size, avoid reallocs; reserve space for max size
-        allMatches = new MatchInfoVector();
+        MatchInfo* entries = (MatchInfo*)allocator->allocate(sizeof(MatchInfo)* maxHitsToConsider);
+        allMatches = new MatchInfoVector(entries, maxHitsToConsider);
     }
  
 }
@@ -1503,17 +1503,18 @@ BaseAligner::getBigAllocatorReservation(GenomeIndex *index, bool ownLandauVishki
     }
 
     return
-        contigCounters                                                  +
-        sizeof(_uint64) * 14                                            + // allow for alignment
-        sizeof(BaseAligner)                                             + // our own member variables
+        contigCounters +
+        sizeof(_uint64)* 14 + // allow for alignment
+        sizeof(BaseAligner)+ // our own member variables
         (ownLandauVishkin ?
-            LandauVishkin<>::getBigAllocatorReservation() +
-            LandauVishkin<-1>::getBigAllocatorReservation() : 0)        + // our LandauVishkin objects
-        sizeof(char) * maxReadSize * 2                                  + // rcReadData
-        sizeof(char) * maxReadSize * 4 + 2 * MAX_K                      + // reversed read (both)
-        sizeof(BYTE) * (maxReadSize + 7 + 128) / 8                      + // seed used
-        sizeof(HashTableElement) * hashTableElementPoolSize             + // hash table element pool
-        sizeof(HashTableAnchor) * candidateHashTablesSize * 2           + // candidate hash table (both)
+        LandauVishkin<>::getBigAllocatorReservation() +
+        LandauVishkin<-1>::getBigAllocatorReservation() : 0) + // our LandauVishkin objects
+        sizeof(char)* maxReadSize * 2 + // rcReadData
+        sizeof(char)* maxReadSize * 4 + 2 * MAX_K + // reversed read (both)
+        sizeof(BYTE)* (maxReadSize + 7 + 128) / 8 + // seed used
+        sizeof(HashTableElement)* hashTableElementPoolSize + // hash table element pool
+        sizeof(HashTableAnchor)* candidateHashTablesSize * 2 + // candidate hash table (both)
+        (index->getGenome()->hasAltContigs() ? sizeof(MatchInfo) * maxHitsToConsider : 0) + // matches for alt contigs
         sizeof(HashTableElement) * (maxSeedsToUse + 1);                   // weight lists
 }
 
diff --git a/SNAPLib/BaseAligner.h b/SNAPLib/BaseAligner.h
index ee369a7c..5a876c04 100644
--- a/SNAPLib/BaseAligner.h
+++ b/SNAPLib/BaseAligner.h
@@ -342,7 +342,7 @@ class BaseAligner {
         return a.liftedLocation < b.liftedLocation;
     }
 
-    typedef VariableSizeVector<MatchInfo> MatchInfoVector;
+    typedef VariableSizeVector<MatchInfo,0> MatchInfoVector;
 
     MatchInfoVector* allMatches;
 
diff --git a/SNAPLib/VariableSizeVector.h b/SNAPLib/VariableSizeVector.h
index e32d3dc2..f1b21bc7 100644
--- a/SNAPLib/VariableSizeVector.h
+++ b/SNAPLib/VariableSizeVector.h
@@ -4,6 +4,7 @@
 
 //
 // A variable-size vector that does not perform any memory allocation except to grow.
+// if grow==0 then it must be supplied with a vector to start and it will NOT grow or deallocate
 //
 template<typename V, int grow = 150, bool big = false>
 class VariableSizeVector
@@ -17,31 +18,43 @@ class VariableSizeVector
             WriteErrorMessage("%s: allocate %lld - consider using BigAlloc\n", __FUNCTION__, bytes);
         }
 #endif
+        _ASSERT(grow != 0);
         return big ? BigAlloc(bytes) : malloc(bytes);
     }
 
     inline static void deallocate(void* p)
     {
-        if (big) { BigDealloc(p); } else { free(p); }
+        if (grow != 0) {
+            if (big) { BigDealloc(p); } else { free(p); }
+        }
     }
 
 public:
+
     VariableSizeVector(int i_capacity = 16)
         : entries(NULL), count(0), capacity(i_capacity)
-    {}
+    { _ASSERT(grow != 0); }
     
-    VariableSizeVector(VariableSizeVector& other)
+    VariableSizeVector(V* i_entries, int i_capacity)
+        : entries(i_entries), capacity(i_capacity), count(0)
+    { _ASSERT(grow == 0); }
+
+    VariableSizeVector(VariableSizeVector<V,grow,big>& other)
         : entries(other.entries), count(other.count), capacity(other.capacity)
     {
         other.count = 0;
+        other.capacity = 0;
         other.entries = NULL;
     }
 
     ~VariableSizeVector()
     {
         if (entries != NULL) {
-            deallocate(entries);
+            if (grow != 0) {
+                deallocate(entries);
+            }
             entries = NULL;
+            capacity = 0;
             count = 0;
         }
     }
@@ -57,7 +70,7 @@ class VariableSizeVector
     }
 
 public:
-    void operator=(VariableSizeVector<V>& other)
+    void operator=(VariableSizeVector<V,grow,big>& other)
     {
         entries = other.entries;
         capacity = other.capacity;
@@ -68,6 +81,13 @@ class VariableSizeVector
 
     void reserve(_int64 newCapacity)
     {
+        if (grow == 0) { // fixed-size mode: the storage was supplied at construction and is never reallocated
+            if (newCapacity <= capacity) {
+                return;
+            }
+            WriteErrorMessage("Unable to grow fixed VariableSizeVector from %lld to %lld\n", (_int64) capacity, (_int64) newCapacity);
+            soft_exit(1);
+        }
         _ASSERT(newCapacity >= 0);
         if (newCapacity <= capacity && entries != NULL) {
             return;
@@ -89,8 +109,10 @@ class VariableSizeVector
     inline void clean()
     {
         if (entries != NULL) {
-            deallocate(entries);
-            entries = NULL;
+            if (grow != 0) {
+                deallocate(entries);
+                entries = NULL;
+            }
             count = 0;
         }
     }
@@ -109,11 +131,7 @@ class VariableSizeVector
     
     inline void push_back(V& value)
     {
-        if (entries == NULL) {
-            reserve(capacity);
-        } else if (count == capacity) {
-            reserve((int) (((_int64) count * grow) / 100));
-        }
+        increase();
         _ASSERT(count < capacity);
         entries[count++] = value;
     }

From fa17fff4104b2679aceaaf14e0fb6c5676fc92af Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Thu, 14 Jan 2016 09:32:27 -0800
Subject: [PATCH 18/19] Add parsing options for v38 ref

---
 SNAPLib/FASTA.cpp                        | 111 ++++++++++++++++++-----
 SNAPLib/FASTA.h                          |   6 +-
 SNAPLib/Genome.cpp                       |  31 +++----
 SNAPLib/Genome.h                         |   2 +-
 SNAPLib/GenomeIndex.cpp                  |  88 ++++++++++--------
 SNAPLib/GenomeIndex.h                    |   2 +-
 SNAPLib/IntersectingPairedEndAligner.cpp |  17 ++--
 SNAPLib/IntersectingPairedEndAligner.h   |   1 -
 8 files changed, 165 insertions(+), 93 deletions(-)

diff --git a/SNAPLib/FASTA.cpp b/SNAPLib/FASTA.cpp
index 659bc990..05ae845d 100644
--- a/SNAPLib/FASTA.cpp
+++ b/SNAPLib/FASTA.cpp
@@ -37,6 +37,8 @@ ReadFASTAGenome(
     const char *pieceNameTerminatorCharacters,
     bool spaceIsAPieceNameTerminator,
     unsigned chromosomePaddingSize,
+    const char *chrTag,
+    const char *chrMapFilename,
     AltContigMap* altMap)
 {
     //
@@ -54,15 +56,38 @@ ReadFASTAGenome(
     isValidGenomeCharacter['A'] = isValidGenomeCharacter['T'] = isValidGenomeCharacter['C'] = isValidGenomeCharacter['G'] = isValidGenomeCharacter['N'] = true;
     isValidGenomeCharacter['a'] = isValidGenomeCharacter['t'] = isValidGenomeCharacter['c'] = isValidGenomeCharacter['g'] = isValidGenomeCharacter['n'] = true;
 
+    int lineBufferSize = 0;
+    char *lineBuffer;
+
+    map<string, string> chrMap;
+    if (chrMapFilename != NULL) {
+        FILE* mapFile = fopen(chrMapFilename, "r");
+        if (mapFile == NULL) {
+            WriteErrorMessage("Unable to open -chrmap file '%s'\n", chrMapFilename);
+            return NULL;
+        }
+        while (NULL != reallocatingFgets(&lineBuffer, &lineBufferSize, mapFile)) {
+            if (lineBuffer[0] == '#') {
+                continue;
+            }
+            string chrom;
+            for (char * token = strtok(lineBuffer, "\t\r\n"); token != NULL; token = strtok(NULL, "\t\r\n")) {
+                if (token == lineBuffer) {
+                    chrom = string(token);
+                } else {
+                    chrMap[string(token)] = chrom;
+                }
+            }
+        }
+        fclose(mapFile);
+    }
+
     FILE *fastaFile = fopen(fileName, "r");
     if (fastaFile == NULL) {
         WriteErrorMessage("Unable to open FASTA file '%s' (even though we already got its size)\n",fileName);
         return NULL;
     }
 
-    int lineBufferSize = 0;
-    char *lineBuffer;
- 
     //
     // Count the chromosomes
     //
@@ -97,39 +122,59 @@ ReadFASTAGenome(
             //
             // Now supply the chromosome name.
             //
-            char * terminator = lineBuffer + strlen(lineBuffer);
-            char * p;
-            if (NULL != pieceNameTerminatorCharacters) {
-                for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
-                    p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]);
+            const char *chrName;
+            int chrNameLen;
+            if (chrTag == NULL) {
+                char * terminator = lineBuffer + strlen(lineBuffer);
+                char * p;
+                if (NULL != pieceNameTerminatorCharacters) {
+                    for (int i = 0; i < strlen(pieceNameTerminatorCharacters); i++) {
+                        p = strchr(lineBuffer + 1, pieceNameTerminatorCharacters[i]);
+                        if (NULL != p && p < terminator) {
+                            terminator = p;
+                        }
+                    }
+                }
+                if (spaceIsAPieceNameTerminator) {
+                    p = strchr(lineBuffer, ' ');
+                    if (NULL != p && p < terminator) {
+                        terminator = p;
+                    }
+                    p = strchr(lineBuffer, '\t');
                     if (NULL != p && p < terminator) {
                         terminator = p;
                     }
                 }
-            }
-            if (spaceIsAPieceNameTerminator) {
-                p = strchr(lineBuffer, ' ');
+                p = strchr(lineBuffer, '\n');
                 if (NULL != p && p < terminator) {
                     terminator = p;
                 }
-                p = strchr(lineBuffer, '\t');
+                p = strchr(lineBuffer, '\r');
                 if (NULL != p && p < terminator) {
                     terminator = p;
                 }
-            }
-            p = strchr(lineBuffer, '\n');
-            if (NULL != p && p < terminator) {
-                terminator = p;
-            }
-            p = strchr(lineBuffer, '\r');
-            if (NULL != p && p < terminator) {
-                terminator = p;
+                chrName = lineBuffer + 1;
+                chrNameLen = (int) (terminator - lineBuffer - 1);
+            } else {
+                if (!FindFASTATagValue(lineBuffer, chrTag, &chrName, &chrNameLen)) {
+                    WriteErrorMessage("Unable to find tag '%s' in contig '%s'\n", chrTag, lineBuffer + 1);
+                    soft_exit(1);
+                }
+                if (chrMapFilename != NULL) {
+                    map<string,string>::iterator mapped = chrMap.find(string(chrName, chrName + chrNameLen));
+                    if (mapped != chrMap.end()) {
+                        chrName = mapped->second.data();
+                        chrNameLen = (int) mapped->second.length();
+                    }
+                }
             }
             if (altMap != NULL) {
-                altMap->addFastaContig(lineBuffer, terminator);
+                altMap->addFastaContig(lineBuffer, chrName, chrNameLen);
             }
-            *terminator = '\0';
-            genome->startContig(lineBuffer+1, altMap);
+            char *contigName = (char*) malloc(chrNameLen + 1);
+            memcpy(contigName, chrName, chrNameLen);
+            contigName[chrNameLen] = '\0';
+            genome->startContig(contigName, altMap);
         } else {
             if (!inAContig) {
                 WriteErrorMessage("\nFASTA file doesn't beging with a contig name (i.e., the first line doesn't start with '>').\n");
@@ -208,3 +253,36 @@ bool AppendFASTAGenome(const Genome *genome, FILE *fasta, const char *prefix="")
     }
     return !ferror(fasta);
 }
+
+//
+// Find the value of a tag in a FASTA header line of the form ">...|tag|value|...".
+// The tag must be a complete field: directly preceded by '>' or '|' and directly followed by '|'.
+// On success sets *pTagValue / *pValueLength to the value, which points into lineBuffer and is
+// NOT null-terminated, and returns true.  Returns false if no such field is present.
+// A value with no closing '|' is reported through WriteErrorMessage followed by soft_exit(1).
+//
+    bool
+FindFASTATagValue(const char* lineBuffer, const char* tagName, const char ** pTagValue, int * pValueLength)
+{
+    const size_t tagNameLength = strlen(tagName);
+    if (tagNameLength == 0 || lineBuffer[0] == '\0') {
+        return false;   // nothing to search for / nothing to search in (also keeps tag + 1 in bounds)
+    }
+    const char *tag = lineBuffer;
+    do {
+        tag = strstr(tag + 1, tagName);
+        if (tag == NULL) {
+            return false;
+        }
+        // Skip occurrences embedded in a longer field; both delimiters are required.
+    } while ((tag[-1] != '>' && tag[-1] != '|') || tag[tagNameLength] != '|');
+    *pTagValue = tag + tagNameLength + 1; // Format is "tag|value|"
+    const char *tagValueEnd = strchr(*pTagValue, '|');
+    if (tagValueEnd == NULL) {
+        WriteErrorMessage("Badly formatted tag '%s' in contig '%s'\n", tag, lineBuffer + 1);
+        soft_exit(1);
+        return false;
+    }
+    *pValueLength = (int) (tagValueEnd - *pTagValue);
+    return true;
+}
diff --git a/SNAPLib/FASTA.h b/SNAPLib/FASTA.h
index cda9a1f8..f3c0e3d3 100644
--- a/SNAPLib/FASTA.h
+++ b/SNAPLib/FASTA.h
@@ -27,7 +27,7 @@ Revision History:
 #include "Genome.h"
 
     const Genome *
-ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, AltContigMap* altMap);
+ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters, bool spaceIsAPieceNameTerminator, unsigned chromosomePaddingSize, const char* chrTag, const char* chrMapFilename, AltContigMap* altMap);
 
 //
 // The FASTA appending functions return whether the write was successful.
@@ -39,3 +39,7 @@ ReadFASTAGenome(const char *fileName, const char *pieceNameTerminatorCharacters,
 
     bool
 AppendFASTAGenome(const Genome *, FILE *fasta);
+
+// utility for parsing FASTA tags
+    bool
+FindFASTATagValue(const char* lineBuffer, const char* tag, const char ** pTagValue, int * pValueLength);
diff --git a/SNAPLib/Genome.cpp b/SNAPLib/Genome.cpp
index 8a6a4fb4..490393b5 100755
--- a/SNAPLib/Genome.cpp
+++ b/SNAPLib/Genome.cpp
@@ -24,6 +24,7 @@ Revision History:
 
 #include "stdafx.h"
 #include "Genome.h"
+#include "FASTA.h"
 #include "GenericFile.h"
 #include "GenericFile_map.h"
 #include "Compat.h"
@@ -33,6 +34,9 @@ Revision History:
 #include "Util.h"
 #include "VariableSizeVector.h"
 
+#include <string>
+using namespace std;
+
 Genome::Genome(GenomeDistance i_maxBases, GenomeDistance nBasesStored, unsigned i_chromosomePadding, unsigned i_maxContigs)
 : maxBases(i_maxBases), minLocation(0), maxLocation(i_maxBases), chromosomePadding(i_chromosomePadding), maxContigs(i_maxContigs),
 mappedFile(NULL), minAltLocation(i_maxBases)
@@ -545,7 +549,7 @@ GenomeLocation Genome::getLiftedLocation(GenomeLocation altLocation) const
     if (altLocation < minAltLocation) {
         return altLocation;
     }
-    const Contig* alt = getContigAtLocation(altLocation);
+    const Contig* alt = getContigAtLocation(altLocation + chromosomePadding / 2);
     if (alt == NULL || ! alt->isAlternate) {
         return altLocation;
     }
@@ -655,9 +659,8 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
         soft_exit(1);
     }
     *q = '\0';
-    char * tag = (char*) malloc(q - p + 2);
+    char * tag = (char*) malloc(q - p + 1);
     strcpy(tag, p);
-    strcat(tag, "|");
     result->accessionFastaTag = tag;
 
     // get names for each column type (last 2 are optional)
@@ -760,34 +763,34 @@ AltContigMap* AltContigMap::readFromFile(const char* filename, const char* colum
     return result;
 }
 
-void AltContigMap::addFastaContig(const char* lineBuffer, const char* nameTerminator)
+//
+// Register one FASTA contig with the alt map.
+// lineBuffer is the contig's full header line; it is searched for the accessionFastaTag field.
+// chrName/chrNameLength delimit the contig name, which need not be null-terminated.
+// Records name <-> accession in both directions and, when the accession is already present in
+// altsByAccession, stores the contig name on that entry.
+//
+void AltContigMap::addFastaContig(const char* lineBuffer, const char* chrName, int chrNameLength)
 {
     // get the name
-    char* name = (char*) malloc(nameTerminator - lineBuffer);
-    memcpy(name, lineBuffer + 1, nameTerminator - lineBuffer - 1);
-    name[nameTerminator - lineBuffer - 1] = 0;
+    string name(chrName, chrName + chrNameLength);
 
     // find the accession number
-    const char* tag = strstr(lineBuffer, accessionFastaTag);
-    const char* p = tag + strlen(accessionFastaTag);
-    if (tag == NULL || *p == '\0') {
-        WriteErrorMessage("Unable to find accession code for contig %s in FASTA line\n%s\n", name, lineBuffer);
+    const char *accessionStart;
+    int accessionLength;
+    if (!FindFASTATagValue(lineBuffer, accessionFastaTag, &accessionStart, &accessionLength)) {
+        // name is a std::string here; it cannot go through varargs for %s, so pass its C string
+        WriteErrorMessage("Unable to find accession code for contig %s in FASTA line\n%s\n", name.c_str(), lineBuffer);
         soft_exit(1);
     }
-    const char*q = p;
-    while (*q != '\0' && *q != '|' && *q != ' ' && *q != '\t' && *q != '\r' && *q != '\n') {
-        q++;
-    }
-    char* accession = (char*)malloc(q - p);
-    memcpy(accession, p, q - p);
-    *(accession + (q - p)) = '\0';
+    string accession(accessionStart, accessionStart + accessionLength);
 
     nameToAccession[name] = accession;
     accessionToName[accession] = name;
 
     StringAltContigMap::iterator alt = altsByAccession.find(accession);
     if (alt != altsByAccession.end()) {
-        alt->second.name = name;
+        alt->second.name = (new string(name))->data(); // alloc & never free, but tiny :-)
     }
 }
 
diff --git a/SNAPLib/Genome.h b/SNAPLib/Genome.h
index c01a3b61..f1d285e5 100644
--- a/SNAPLib/Genome.h
+++ b/SNAPLib/Genome.h
@@ -336,7 +336,7 @@ class AltContigMap
 
     static AltContigMap* readFromFile(const char* filename, const char* columnList);
 
-    void addFastaContig(const char* lineBuffer, const char* terminator);
+    void addFastaContig(const char* lineBuffer, const char* chrName, int chrNameLength);
 
     void setAltContig(Genome::Contig* contig);
 
diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 637bf9b9..12f86df4 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -46,7 +46,7 @@ static const double DEFAULT_SLACK = 0.3;
 static const unsigned DEFAULT_PADDING = 500;
 static const unsigned DEFAULT_KEY_BYTES = 4;
 static const unsigned DEFAULT_LOCATION_SIZE = 4;
-static const char* DEFAULT_ALT_COLUMNS = "gb,alt_scaf_acc,parent_acc,ori,alt_scaf_start,alt_scaf_stop,parent_start,parent_stop,alt_start_tail,alt_stop_tail";
+static const char* DEFAULT_ALT_COLUMNS = "ref,alt_scaf_acc,parent_acc,ori,alt_scaf_start,alt_scaf_stop,parent_start,parent_stop,alt_start_tail,alt_stop_tail";
 const char *GenomeIndexFileName = "GenomeIndex";
 const char *OverflowTableFileName = "OverflowTable";
 const char *GenomeIndexHashFileName = "GenomeIndexHash";
@@ -89,6 +89,8 @@ static void usage()
         "-altmap file       Tab-separated file of alt contig mapping information\n"
         "-altcols columns   Comma-separated list of columns describing alt mapping file\n"
         "                   Default is v38 %s\n"
+        "-chrtag tag        Tag for chrom name\n"
+        "-chrmap file       Tab-separated file of chrom name and tag values\n"
 			,
             DEFAULT_SEED_SIZE,
             DEFAULT_SLACK,
@@ -128,6 +130,8 @@ GenomeIndex::runIndexer(
 	bool smallMemory = false;
     const char* altMapFilename = NULL;
     const char* altMapColumns = DEFAULT_ALT_COLUMNS;
+    const char* chrTag = NULL;
+    const char* chrMapFilename = NULL;
 
     for (int n = 2; n < argc; n++) {
         if (strcmp(argv[n], "-s") == 0) {
@@ -208,12 +212,33 @@ GenomeIndex::runIndexer(
             } else {
                 usage();
             }
+        } else if (!strcmp(argv[n], "-chrtag")) {
+            if (n + 1 < argc) {
+                chrTag = argv[n + 1];
+                n++;
+            }
+            else {
+                usage();
+            }
+        } else if (!strcmp(argv[n], "-chrmap")) {
+            if (n + 1 < argc) {
+                chrMapFilename = argv[n + 1];
+                n++;
+            }
+            else {
+                usage();
+            }
         } else {
             WriteErrorMessage("Invalid argument: %s\n\n", argv[n]);
             usage();
         }
     }
 
+    if (chrMapFilename != NULL && chrTag == NULL) {
+        WriteErrorMessage("The -chrmap option requires the -chrtag option to be specified\n");
+        usage();
+    }
+
     if (seedLen < 16 || seedLen > 32) {
         // Seeds are stored in 64 bits, so they can't be larger than 32 bases for now.
         WriteErrorMessage("Seed length must be between 16 and 32, inclusive\n");
@@ -246,7 +271,7 @@ GenomeIndex::runIndexer(
 
     AltContigMap* altMap = altMapFilename != NULL ? AltContigMap::readFromFile(altMapFilename, altMapColumns) : NULL;
 
-    const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, altMap);
+    const Genome *genome = ReadFASTAGenome(fastaFile, pieceNameTerminatorCharacters, spaceIsAPieceNameTerminator, chromosomePadding, chrTag, chrMapFilename, altMap);
     if (NULL == genome) {
         WriteErrorMessage("Unable to read FASTA file\n");
         soft_exit(1);
@@ -1059,13 +1084,12 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table,
 
 			_ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
 
-            _int64 nHits, nRCHits;
-            if (unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex, &nHits, &nRCHits)) {
-                if (NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) {
-                    _uint64 value = 42;
-                    seedsSeen->Insert(seed.getBases(), &value);
-                    numExactSeeds[seed.getHighBases(hashTableKeySize)]++;
-                }
+            bool addSeed = unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex) ||
+                ((!large) && hasAnyAltHits(~seed, i, unliftedIndex));
+            if (addSeed && NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) {
+                _uint64 value = 42;
+                seedsSeen->Insert(seed.getBases(), &value);
+                numExactSeeds[seed.getHighBases(hashTableKeySize)]++;
             }
         }
 
@@ -1216,29 +1240,20 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param)
 
 			_ASSERT(whichHashTable < context->nHashTables);
 
-            _int64 nRepeats = 1;
-            if (context->unliftedIndex != NULL) {
-                _int64 nHits, nRCHits;
-                if (hasAnyAltHits(seed, i, context->unliftedIndex, &nHits, &nRCHits)) {
-                    nRepeats = nHits + nRCHits;
-                } else {
-                    nRepeats = 0;
-                }
-            }
-            for (; nRepeats > 0; nRepeats--) {
-                if (batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) {
-                    PerCounterBatch *batch = &batches[whichHashTable];
-                    AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
-                    batch->apply(&(*context->approxCounters)[whichHashTable]);
-                    ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
+            bool addSeed = context->unliftedIndex == NULL || hasAnyAltHits(seed, i,  context->unliftedIndex) ||
+                ((!large) && hasAnyAltHits(~seed, i, context->unliftedIndex));
+            if (addSeed && batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) {
+                PerCounterBatch *batch = &batches[whichHashTable];
+                AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
+                batch->apply(&(*context->approxCounters)[whichHashTable]);
+                ReleaseExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
 
-                    _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds);
+                _int64 basesProcessed = InterlockedAdd64AndReturnNewValue(context->nBasesProcessed, PerCounterBatch::nSeedsPerBatch + unrecordedSkippedSeeds);
 
-                    if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds) / printBatchSize) {
-                        WriteStatusMessage("Bias computation: %lld / %lld\n", (basesProcessed / printBatchSize)*printBatchSize, (_int64)countOfBases);
-                    }
-                    unrecordedSkippedSeeds = 0;  // We've now recorded them.
+                if ((_uint64)basesProcessed / printBatchSize > ((_uint64)basesProcessed - PerCounterBatch::nSeedsPerBatch - unrecordedSkippedSeeds) / printBatchSize) {
+                    WriteStatusMessage("Bias computation: %lld / %lld\n", (basesProcessed / printBatchSize)*printBatchSize, (_int64)countOfBases);
                 }
+                unrecordedSkippedSeeds = 0;  // We've now recorded them.
             }
     }
 
@@ -1269,18 +1284,13 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param)
 
 bool
 GenomeIndex::hasAnyAltHits(
-    Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits)
+    Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex)
 {
-    if (unliftedIndex == NULL) {
-        return false;
-    }
     _int64 nHits, nRCHits;
     if (unliftedIndex->doesGenomeIndexHave64BitLocations()) {
         const GenomeLocation *hits, *rcHits;
         GenomeLocation singleHit[2], singleRCHit[2];
-        unliftedIndex->lookupSeed(seed, pnHits, &hits, pnRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
-        *pnHits = nHits;
-        *pnRCHits = nRCHits;
+        unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
 #define HAS_ANY_ALTS \
         if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \
             (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \
@@ -1290,12 +1300,12 @@ GenomeIndex::hasAnyAltHits(
                 } \
             } \
             for (int i = 0; i < nRCHits; i++) { \
-                if (unliftedIndex->genome->isAltLocation(hits[i])) { \
+                if (unliftedIndex->genome->isAltLocation(rcHits[i])) { \
                 return true; \
                 } \
             } \
-            return false; \
-        }
+        } \
+        return false;
         HAS_ANY_ALTS
     } else {
         const unsigned *hits, *rcHits;
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
index c0897a79..0bcaad1a 100644
--- a/SNAPLib/GenomeIndex.h
+++ b/SNAPLib/GenomeIndex.h
@@ -207,7 +207,7 @@ class GenomeIndex {
 
     static void ComputeBiasTableWorkerThreadMain(void *param);
 
-    static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex, _int64 *pnHits, _int64 *pnRCHits);
+    static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex);
 
     struct OverflowBackpointer;
 
diff --git a/SNAPLib/IntersectingPairedEndAligner.cpp b/SNAPLib/IntersectingPairedEndAligner.cpp
index 92f73883..1c5087df 100644
--- a/SNAPLib/IntersectingPairedEndAligner.cpp
+++ b/SNAPLib/IntersectingPairedEndAligner.cpp
@@ -534,7 +534,6 @@ IntersectingPairedEndAligner::align(
                 lowestFreeScoringMateCandidate[whichSetPair]++;
 
                 previousMoreHitsLocation = lastGenomeLocationForReadWithMoreHits;
-
                 if (!setPair[readWithMoreHits]->getNextLowerHit(&lastGenomeLocationForReadWithMoreHits, &lastSeedOffsetForReadWithMoreHits, pLastUnliftedGenomeLocationForReadWithMoreHits)) {
                     lastGenomeLocationForReadWithMoreHits = 0;
                     outOfMoreHitsLocations = true;
@@ -1385,14 +1384,14 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
 }
 
     bool
-        IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation)
+IntersectingPairedEndAligner::HashTableHitSet::getNextLowerHit(
+    GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation)
 {
     //
     // Look through all of the lookups and find the one with the highest location smaller than the current one.
     //
     GenomeLocation foundLocation = 0;
     bool anyFound = false;
-    const bool setUnlifted = unliftedGenomeLocation != NULL;
 
     //
     // Run through the lookups pushing up any that are at the most recently returned
@@ -1401,8 +1400,8 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
     for (unsigned i = 0; i < nLookupsUsed; i++) {
         _int64 *currentHitForIntersection;
         _int64 nHits;
-        GenomeLocation hitLocation;
-        GenomeLocation unliftedHitLocation;
+        GenomeLocation hitLocation = -1;
+        GenomeLocation unliftedHitLocation = -2;
         unsigned seedOffset;
 
         //
@@ -1414,7 +1413,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
         seedOffset = lookups[i].seedOffset;                                                                             \
         if (nHits != *currentHitForIntersection) {                                                                      \
             hitLocation = lookups[i].hits[*currentHitForIntersection];                                                  \
-            if (setUnlifted) {                                                                                          \
+            if (unliftedGenomeLocation != NULL) {                                                                       \
                 unliftedHitLocation = lookups[i].unliftedHits[*currentHitForIntersection];                              \
             }                                                                                                           \
         }
@@ -1436,12 +1435,12 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
             }
             if (doesGenomeIndexHave64BitLocations) {
                 hitLocation = lookups64[i].hits[*currentHitForIntersection];
-                if (setUnlifted) {
+                if (unliftedGenomeLocation != NULL) {
                     unliftedHitLocation = lookups64[i].unliftedHits[*currentHitForIntersection];
                 }
             } else {
                 hitLocation = lookups32[i].hits[*currentHitForIntersection];
-                if (setUnlifted) {
+                if (unliftedGenomeLocation != NULL) {
                     unliftedHitLocation = lookups32[i].unliftedHits[*currentHitForIntersection];
                 }
             }
@@ -1452,7 +1451,7 @@ IntersectingPairedEndAligner::HashTableHitSet::computeBestPossibleScoreForCurren
                 hitLocation >= seedOffset) // found location isn't too small to push us before the beginning of the genome
             {
                 *genomeLocation = foundLocation = hitLocation - seedOffset;
-                if (setUnlifted) {
+                if (unliftedGenomeLocation != NULL) {
                     *unliftedGenomeLocation = unliftedHitLocation - seedOffset;
                 }
                 *seedOffsetFound = seedOffset;
diff --git a/SNAPLib/IntersectingPairedEndAligner.h b/SNAPLib/IntersectingPairedEndAligner.h
index 39c258ef..3884892d 100644
--- a/SNAPLib/IntersectingPairedEndAligner.h
+++ b/SNAPLib/IntersectingPairedEndAligner.h
@@ -229,7 +229,6 @@ class IntersectingPairedEndAligner : public PairedEndAligner
         //
         bool getNextLowerHit(GenomeLocation *genomeLocation, unsigned *seedOffsetFound, GenomeLocation *unliftedGenomeLocation);
 
-
         //
         // Find the highest genome address.
         //

From 919b95962e2e1f52228ea5489147286465037e95 Mon Sep 17 00:00:00 2001
From: Ravi Pandya <ravi@iecommerce.com>
Date: Thu, 14 Jan 2016 10:42:45 -0800
Subject: [PATCH 19/19] Fix rc sort, non-large alt index build

---
 SNAPLib/GenomeIndex.cpp      | 29 +++++++++++++++--------------
 SNAPLib/GenomeIndex.h        |  2 +-
 SNAPLib/SortedDataWriter.cpp | 10 ++++++++++
 3 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/SNAPLib/GenomeIndex.cpp b/SNAPLib/GenomeIndex.cpp
index 12f86df4..06fb64d4 100644
--- a/SNAPLib/GenomeIndex.cpp
+++ b/SNAPLib/GenomeIndex.cpp
@@ -1084,8 +1084,7 @@ GenomeIndex::ComputeBiasTable(const Genome* genome, int seedLen, double* table,
 
 			_ASSERT(seed.getHighBases(hashTableKeySize) < nHashTables);
 
-            bool addSeed = unliftedIndex == NULL || hasAnyAltHits(seed, i, unliftedIndex) ||
-                ((!large) && hasAnyAltHits(~seed, i, unliftedIndex));
+            bool addSeed = unliftedIndex == NULL || unliftedIndex->hasAnyAltHitsAndLocationIsFirst(seed, i, large);
             if (addSeed && NULL == seedsSeen->GetFirstValueForKey(seed.getBases())) {
                 _uint64 value = 42;
                 seedsSeen->Insert(seed.getBases(), &value);
@@ -1240,8 +1239,7 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param)
 
 			_ASSERT(whichHashTable < context->nHashTables);
 
-            bool addSeed = context->unliftedIndex == NULL || hasAnyAltHits(seed, i,  context->unliftedIndex) ||
-                ((!large) && hasAnyAltHits(~seed, i, context->unliftedIndex));
+            bool addSeed = context->unliftedIndex == NULL || context->unliftedIndex->hasAnyAltHitsAndLocationIsFirst(seed, i, large);
             if (addSeed && batches[whichHashTable].addSeed(seed.getLowBases(context->hashTableKeySize))) {
                 PerCounterBatch *batch = &batches[whichHashTable];
                 AcquireExclusiveLock(&context->approximateCounterLocks[whichHashTable]);
@@ -1283,25 +1281,28 @@ GenomeIndex::ComputeBiasTableWorkerThreadMain(void *param)
 }
 
 bool
-GenomeIndex::hasAnyAltHits(
-    Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex)
+GenomeIndex::hasAnyAltHitsAndLocationIsFirst(
+    Seed seed, GenomeLocation genomeLocation, bool large) const
 {
     _int64 nHits, nRCHits;
-    if (unliftedIndex->doesGenomeIndexHave64BitLocations()) {
+    if (doesGenomeIndexHave64BitLocations()) {
         const GenomeLocation *hits, *rcHits;
         GenomeLocation singleHit[2], singleRCHit[2];
-        unliftedIndex->lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
+        lookupSeed(seed, &nHits, &hits, &nRCHits, &rcHits, &singleHit[1], &singleRCHit[1]);
 #define HAS_ANY_ALTS \
-        if ((nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \
-            (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits))) { \
+        bool isFirst = large \
+            ? (nHits > 0 && genomeLocation == *hits && (nRCHits == 0 || *hits <= *rcHits)) || \
+                (nRCHits > 0 && genomeLocation == *rcHits && (nHits == 0 || *rcHits < *hits)) \
+            : nHits > 0 && genomeLocation == *hits; \
+        if (isFirst) { \
             for (int i = 0; i < nHits; i++) { \
-                if (unliftedIndex->genome->isAltLocation(hits[i])) { \
+                if (genome->isAltLocation(hits[i])) { \
                     return true; \
                 } \
             } \
             for (int i = 0; i < nRCHits; i++) { \
-                if (unliftedIndex->genome->isAltLocation(rcHits[i])) { \
-                return true; \
+                if (genome->isAltLocation(rcHits[i])) { \
+                    return true; \
                 } \
             } \
         } \
@@ -1309,7 +1310,7 @@ GenomeIndex::hasAnyAltHits(
         HAS_ANY_ALTS
     } else {
         const unsigned *hits, *rcHits;
-        unliftedIndex->lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
+        lookupSeed32(seed, &nHits, &hits, &nRCHits, &rcHits);
         HAS_ANY_ALTS
     }
 }
diff --git a/SNAPLib/GenomeIndex.h b/SNAPLib/GenomeIndex.h
index 0bcaad1a..86a28c4f 100644
--- a/SNAPLib/GenomeIndex.h
+++ b/SNAPLib/GenomeIndex.h
@@ -207,7 +207,7 @@ class GenomeIndex {
 
     static void ComputeBiasTableWorkerThreadMain(void *param);
 
-    static bool hasAnyAltHits(Seed seed, GenomeLocation genomeLocation, const GenomeIndex* unliftedIndex);
+    bool hasAnyAltHitsAndLocationIsFirst(Seed seed, GenomeLocation genomeLocation, bool large) const;
 
     struct OverflowBackpointer;
 
diff --git a/SNAPLib/SortedDataWriter.cpp b/SNAPLib/SortedDataWriter.cpp
index 85ad336b..2d8970fd 100644
--- a/SNAPLib/SortedDataWriter.cpp
+++ b/SNAPLib/SortedDataWriter.cpp
@@ -184,6 +184,16 @@ SortedDataFilter::onAdvance(
     GenomeDistance bytes,
     GenomeLocation location)
 {
+    if (location != InvalidGenomeLocation && parent->genome->hasAltContigs()) {
+        // Reads mapped to RC alt contigs need their location flipped to sort properly, so re-derive it from the record via getSortInfo.
+        const Genome::Contig* c = parent->genome->getContigAtLocation(location);
+        if (c != NULL && c->isAlternateRC) {
+            GenomeLocation rcLocation;
+            GenomeDistance ignore;
+            parent->format->getSortInfo(parent->genome, data, bytes, &rcLocation, &ignore);
+            location = rcLocation;
+        }
+    }
     SortEntry entry(batchOffset, bytes, location);
 #ifdef VALIDATE_SORT
 		if (memcmp(data, "BAM", 3) != 0 && memcmp(data, "@HD", 3) != 0) { // skip header block