-
Notifications
You must be signed in to change notification settings - Fork 243
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Refactoring VariantContextComparator #1629
Open
lbergelson
wants to merge
1
commit into
master
Choose a base branch
from
lb_pr_comments
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,19 +11,22 @@ | |
import java.util.Comparator; | ||
import java.util.HashMap; | ||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Set; | ||
import java.util.TreeSet; | ||
|
||
/** | ||
* A Comparator that orders VariantContexts by the ordering of the contigs/chromosomes in the List | ||
* provided at construction time, then by start position with each contig/chromosome. | ||
* provided at construction time, then by start position with each contig/chromosome, then by Ref | ||
* and finally by order agnostic Alt alleles | ||
*/ | ||
public class VariantContextComparator implements Comparator<VariantContext>, Serializable { | ||
private static final long serialVersionUID = 1L; | ||
|
||
public static List<String> getSequenceNameList(final SAMSequenceDictionary dictionary) { | ||
final List<String> list = new ArrayList<String>(); | ||
final List<String> list = new ArrayList<>(); | ||
for (final SAMSequenceRecord record : dictionary.getSequences()) { | ||
list.add(record.getSequenceName()); | ||
} | ||
|
@@ -32,11 +35,20 @@ public static List<String> getSequenceNameList(final SAMSequenceDictionary dicti | |
|
||
// For fast lookup of the contig's index in the contig list | ||
private final Map<String, Integer> contigIndexLookup; | ||
private final Comparator<VariantContext> internalComparator; | ||
|
||
private static Comparator<VariantContext> makeComparator(final Map<String, Integer> contigIndexLookup) { | ||
return Comparator.comparingInt((VariantContext vc) -> contigIndexLookup.get(vc.getContig())) | ||
.thenComparingInt(VariantContext::getStart) | ||
.thenComparingInt(VariantContext::getEnd) | ||
.thenComparing(VariantContext::getReference) | ||
.thenComparing(VariantContext::getAlternateAlleles, new SortedUniquedElementsComparator<>()); | ||
} | ||
|
||
public VariantContextComparator(final List<String> contigs) { | ||
if (contigs.isEmpty()) throw new IllegalArgumentException("One or more contigs must be in the contig list."); | ||
|
||
final Map<String, Integer> protoContigIndexLookup = new HashMap<String, Integer>(); | ||
final Map<String, Integer> protoContigIndexLookup = new HashMap<>(); | ||
int index = 0; | ||
for (final String contig : contigs) { | ||
protoContigIndexLookup.put(contig, index++); | ||
|
@@ -47,6 +59,7 @@ public VariantContextComparator(final List<String> contigs) { | |
} | ||
|
||
this.contigIndexLookup = Collections.unmodifiableMap(protoContigIndexLookup); | ||
this.internalComparator = makeComparator(this.contigIndexLookup); | ||
} | ||
|
||
/** | ||
|
@@ -57,7 +70,7 @@ public VariantContextComparator(final List<String> contigs) { | |
public VariantContextComparator(final Collection<VCFContigHeaderLine> headerLines) { | ||
if (headerLines.isEmpty()) throw new IllegalArgumentException("One or more header lines must be in the header line collection."); | ||
|
||
final Map<String, Integer> protoContigIndexLookup = new HashMap<String, Integer>(); | ||
final Map<String, Integer> protoContigIndexLookup = new HashMap<>(); | ||
for (final VCFContigHeaderLine headerLine : headerLines) { | ||
protoContigIndexLookup.put(headerLine.getID(), headerLine.getContigIndex()); | ||
} | ||
|
@@ -66,12 +79,13 @@ public VariantContextComparator(final Collection<VCFContigHeaderLine> headerLine | |
throw new IllegalArgumentException("There are duplicate contigs/chromosomes in the input header line collection."); | ||
} | ||
|
||
final Set<Integer> protoIndexValues = new HashSet<Integer>(protoContigIndexLookup.values()); | ||
final Set<Integer> protoIndexValues = new HashSet<>(protoContigIndexLookup.values()); | ||
if (protoIndexValues.size() != headerLines.size()) { | ||
throw new IllegalArgumentException("One or more contigs share the same index number."); | ||
} | ||
|
||
this.contigIndexLookup = Collections.unmodifiableMap(protoContigIndexLookup); | ||
this.internalComparator = makeComparator(this.contigIndexLookup); | ||
} | ||
|
||
public VariantContextComparator(final SAMSequenceDictionary dictionary) { | ||
|
@@ -83,20 +97,24 @@ public int compare(final VariantContext firstVariantContext, final VariantContex | |
// Will throw NullPointerException -- happily -- if either of the chromosomes/contigs aren't | ||
// present. This error checking should already have been done in the constructor but it's left | ||
// in as defence anyway. | ||
int contigCompare = this.contigIndexLookup.get(firstVariantContext.getContig()).compareTo(this.contigIndexLookup.get(secondVariantContext.getContig())); | ||
contigCompare = contigCompare == 0 ? firstVariantContext.getStart() - secondVariantContext.getStart() : contigCompare; | ||
if (contigCompare == 0) { | ||
// Compare variants that have the same genomic span (chr:start-end) lexicographically by all alleles (ref and alts). | ||
for (int i = 0; i < firstVariantContext.getAlleles().size(); i++) { | ||
// If all previous alleles are identical and the first variant has additional alleles, make the first variant greater. | ||
if (i >= secondVariantContext.getAlleles().size()) { return 1; } | ||
contigCompare = firstVariantContext.getAlleles().get(i).compareTo(secondVariantContext.getAlleles().get(i)); | ||
if (contigCompare != 0) return contigCompare; | ||
return internalComparator.compare(firstVariantContext, secondVariantContext); | ||
} | ||
|
||
//Sorts and uniques the inputs and then compares the values pairwise in their natural order | ||
//until there is a mismatch or 1 of the inputs runs out. | ||
private static class SortedUniquedElementsComparator<T extends Comparable<T>> implements Comparator<Collection<T>>{ | ||
@Override | ||
public int compare(final Collection<T> o1, final Collection<T> o2) { | ||
final Iterator<T> left = new TreeSet<>(o1).iterator(); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps is worth to add trivial cases to avoid TreeSet constructons since quite often alt-alleles list are just 1 in length? |
||
final Iterator<T> right = new TreeSet<>(o2).iterator(); | ||
while(left.hasNext() && right.hasNext()){ | ||
final int result = left.next().compareTo(right.next()); | ||
if( result != 0 ){ | ||
return result; | ||
} | ||
} | ||
return Boolean.compare(left.hasNext(), right.hasNext()); | ||
} | ||
// If all previous alleles are identical and the second variant has additional alleles, make the second variant greater. | ||
if (firstVariantContext.getAlleles().size() < secondVariantContext.getAlleles().size()) { return -1; } | ||
return contigCompare; | ||
} | ||
|
||
/** | ||
|
102 changes: 46 additions & 56 deletions
102
src/test/java/htsjdk/variant/variantcontext/VariantContextComparatorUnitTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,75 +1,65 @@ | ||
package htsjdk.variant.variantcontext; | ||
|
||
import htsjdk.HtsjdkTest; | ||
import htsjdk.samtools.util.Locatable; | ||
import htsjdk.tribble.SimpleFeature; | ||
import org.testng.Assert; | ||
import org.testng.annotations.BeforeSuite; | ||
import org.testng.annotations.DataProvider; | ||
import org.testng.annotations.Test; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.Collections; | ||
import java.util.List; | ||
|
||
/** | ||
* Unit tests for VariantContextComparator. | ||
*/ | ||
public class VariantContextComparatorUnitTest extends HtsjdkTest { | ||
private Allele refA, altG, altT; | ||
|
||
@BeforeSuite | ||
public void before() { | ||
refA = Allele.create("A", true); | ||
altG = Allele.create("G", false); | ||
altT = Allele.create("T", false); | ||
public class VariantContextComparatorUnitTest extends HtsjdkTest{ | ||
|
||
private static final String CHR1 = "chr1" , CHR2 = "chr2"; | ||
|
||
@DataProvider | ||
public Object[][] getVCs(){ | ||
final Locatable chr1_1_1 = new SimpleFeature(CHR1, 1, 1); | ||
final Locatable chr1_11_11 = new SimpleFeature(CHR1, 11, 11); | ||
final Locatable chr2_1_1 = new SimpleFeature(CHR2, 1, 1); | ||
final Locatable chr2_10_10 = new SimpleFeature(CHR2, 10,10); | ||
final VariantContext chr1_1_1_A_G = make(chr1_1_1, Allele.REF_A, Allele.ALT_G); | ||
final VariantContext chr2_1_1_A_G = make(chr2_1_1, Allele.REF_A, Allele.ALT_G); | ||
|
||
final VariantContext chr2_10_10_A_G = make(chr2_10_10, Allele.REF_A, Allele.ALT_G); | ||
final VariantContext chr1_11_11_A_G = make(chr1_11_11, Allele.REF_A, Allele.ALT_G); | ||
final VariantContext chr1_1_A = make(chr1_1_1, Allele.REF_A); | ||
return new Object[][] { | ||
{chr1_1_1_A_G, chr1_1_1_A_G, 0}, //identical should match | ||
{chr1_1_1_A_G, chr2_1_1_A_G, -1}, //chr2 comes after chr1 | ||
{chr1_1_1_A_G, chr2_10_10_A_G, -1}, | ||
{chr1_11_11_A_G, chr2_10_10_A_G, -1}, //chr before pos | ||
{chr1_11_11_A_G, chr1_1_1_A_G, 1}, | ||
{chr1_1_1_A_G, make(chr1_1_1, Allele.REF_A, Allele.ALT_T), -1}, //alleles sort lexicographichally | ||
{make(chr1_1_1, Allele.REF_A, Allele.ALT_G, Allele.ALT_T), chr1_1_1_A_G, 1},//extra alleles sort after fewer | ||
{chr1_1_1_A_G, make(chr1_1_1, Allele.REF_G, Allele.ALT_A), -1}, //mismatch ref takes precendence | ||
{make(chr1_1_1, Allele.REF_T, Allele.ALT_C, Allele.create("AAA"), Allele.ALT_G), // allele order doesn't matter | ||
make(chr1_1_1, Allele.REF_T, Allele.ALT_G, Allele.create("AAA"), Allele.ALT_C), 0}, | ||
{chr1_1_1_A_G, chr1_1_A, 1}, // no alt allele | ||
{chr1_1_A, chr1_1_A, 0} // no alt alleles at all | ||
|
||
}; | ||
} | ||
|
||
@Test | ||
public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleIdentical() { | ||
final String contig = "chr1"; | ||
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); | ||
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); | ||
|
||
final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make(); | ||
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG)).make(); | ||
|
||
final int compare = comparator.compare(variant1, variant2); | ||
Assert.assertEquals(compare, 0); // TODO: What other criteria might we sort by to break this tie? | ||
} | ||
|
||
@Test | ||
public void testVariantContextsWithSameSiteSortLexicographicallyByAllele() { | ||
final String contig = "chr1"; | ||
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); | ||
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); | ||
|
||
final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make(); | ||
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altT)).make(); | ||
|
||
final int compare = comparator.compare(variant1, variant2); | ||
Assert.assertEquals(compare, -1); | ||
private static VariantContext make(Locatable pos, Allele ref, Allele ... alt){ | ||
final List<Allele> alleles = new ArrayList<>(); | ||
alleles.add(ref); | ||
alleles.addAll(Arrays.asList(alt)); | ||
return new VariantContextBuilder("test", pos.getContig(), pos.getStart(), pos.getEnd(), alleles).make(); | ||
} | ||
|
||
@Test | ||
public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleThenExtraAllelesForFirstVariant() { | ||
final String contig = "chr1"; | ||
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); | ||
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); | ||
|
||
final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG, altT)).make(); | ||
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG)).make(); | ||
|
||
final int compare = comparator.compare(variant1, variant2); | ||
Assert.assertEquals(compare, 1); | ||
} | ||
|
||
@Test | ||
public void testVariantContextsWithSameSiteSortLexicographicallyByAlleleThenExtraAllelesForSecondVariant() { | ||
final String contig = "chr1"; | ||
final VariantContextComparator comparator = new VariantContextComparator(Collections.singletonList(contig)); | ||
final VariantContextBuilder builder = new VariantContextBuilder("test", contig, 1, 1, Collections.emptyList()); | ||
|
||
final VariantContext variant1 = builder.alleles(Arrays.asList(refA, altG)).make(); | ||
final VariantContext variant2 = builder.alleles(Arrays.asList(refA, altG, altT)).make(); | ||
|
||
final int compare = comparator.compare(variant1, variant2); | ||
Assert.assertEquals(compare, -1); | ||
@Test(dataProvider = "getVCs") | ||
public void testVariantContextComparator(VariantContext left, VariantContext right, int expectedResult){ | ||
final VariantContextComparator comparator = new VariantContextComparator(Arrays.asList(CHR1, CHR2)); | ||
Assert.assertEquals(comparator.compare(left, right), expectedResult); | ||
Assert.assertEquals(comparator.compare(right, left), -expectedResult); // make sure it's symmetrical | ||
} | ||
} |
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since you are using a lambda to determine the index given the contig names perhaps that is what you should compose in the constructors:
If the constructor param is a SAMSeqDict then (x) -> dict.getRecord(x).getIndex().
If the constructor param is a List then you create that Map.
Also you can add here the NPE capture into a IAE (or whatever) for missing contigs.