From 086bc0db61847fe3efa84c5080c36cb2f0dbf831 Mon Sep 17 00:00:00 2001 From: Oliver Holmes Date: Tue, 27 Feb 2024 14:52:11 +1000 Subject: [PATCH 1/2] refactor(qannotate nanno): use long to store chr and position when interrogating annotation position The hope is that by creating fewer objects, it will be quicker (and simpler) when comparing positions in qannotate's nanno --- build.gradle | 4 + .../au/edu/qimr/qannotate/nanno/Annotate.java | 672 +++++++++--------- .../qimr/qannotate/nanno/AnnotateUtils.java | 549 +++++++------- .../qannotate/nanno/AnnotationInputs.java | 26 +- .../qannotate/nanno/AnnotationSource.java | 553 ++++++++------ .../nanno/AnnotationSourceSnpEffVCF.java | 540 +++++++------- .../qannotate/nanno/AnnotationSourceTSV.java | 27 +- .../qannotate/nanno/AnnotationSourceVCF.java | 24 +- .../qimr/qannotate/nanno/AnnotateTest.java | 34 +- .../nanno/AnnotationSourceTSVTest.java | 36 +- .../qannotate/nanno/AnnotationSourceTest.java | 282 +++++--- .../common/model/ChrPositionComparator.java | 362 ++++++---- .../qcmg/common/util/ChrPositionUtils.java | 114 +-- .../org/qcmg/common/util/TabTokenizer.java | 346 ++++----- .../src/org/qcmg/common/vcf/VcfRecord.java | 20 +- .../model/ChrPositionComparatorTest.java | 26 +- .../common/util/ChrPositionUtilsTest.java | 11 + qio/src/org/qcmg/qio/record/RecordReader.java | 157 ++-- .../org/qcmg/qio/record/StringFileReader.java | 5 +- qsignature/src/org/qcmg/sig/Generate.java | 10 +- 20 files changed, 2028 insertions(+), 1770 deletions(-) diff --git a/build.gradle b/build.gradle index 442c68c5e..03ff7202c 100644 --- a/build.gradle +++ b/build.gradle @@ -54,6 +54,10 @@ subprojects { checkstyleTest.enabled=false } checkstyleMain.onlyIf {project.hasProperty('checkstyle')} + checkstyleMain { + mustRunAfter test + mustRunAfter compileJava + } dependencies { testImplementation 'junit:junit:4.13.2' diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java b/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java 
index 68a14ebef..5b2a96645 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java @@ -1,371 +1,333 @@ package au.edu.qimr.qannotate.nanno; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Queue; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.Executor; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - +import au.edu.qimr.qannotate.Messages; +import au.edu.qimr.qannotate.Options; import org.qcmg.common.log.QLogger; import org.qcmg.common.log.QLoggerFactory; import org.qcmg.common.meta.QExec; import org.qcmg.common.model.ChrPosition; import org.qcmg.common.model.ChrPositionRefAlt; +import org.qcmg.common.util.ChrPositionUtils; import org.qcmg.common.vcf.VcfRecord; import org.qcmg.common.vcf.VcfUtils; import org.qcmg.qio.record.RecordWriter; import org.qcmg.qio.vcf.VcfFileReader; -import au.edu.qimr.qannotate.Messages; -import au.edu.qimr.qannotate.Options; +import java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.concurrent.*; +import java.util.stream.Collectors; public class Annotate { - - static final List SEARCH_TERM_VARIETIES = Arrays.asList(">", "->", "-->", "/"); - - static Comparator CUSTOM_COMPARATOR; - static QLogger logger; - - private int exitStatus; - - private String logFile; - private String inputFile; - private String outputFile; - private String jsonInputs; - - private QExec exec; - - public int engage() throws Exception { - - /* - * parse the json file into an AnnotationINputs object - */ - AnnotationInputs ais = 
AnnotateUtils.getInputs(jsonInputs); - logger.info("Number of annotation source threads to use: " + ais.getAnnotationSourceThreadCount()); - /* - * create a comparator that will be used to sort the annotation fields for output - */ - CUSTOM_COMPARATOR = AnnotateUtils.createComparatorFromList(Arrays.stream(ais.getOutputFieldOrder().split(",")).collect(Collectors.toList())); - logger.info("Custom comparator created"); - /* - * check headers that have been supplied in the json inputs file - */ - int headersOK = AnnotateUtils.checkHeaders(ais); - if (headersOK == 1) { - logger.error("Headers have been checked - not OK!!!"); - System.exit(headersOK); - } - logger.info("Headers have been checked - OK"); - - List annotationSources = new ArrayList<>(); - AnnotateUtils.populateAnnotationSources(ais, annotationSources); - logger.info("annotationSources have been loaded (size: " + annotationSources.size() + ")"); - annotationSources.stream().forEach(as -> logger.info(as.toString())); - - CountDownLatch consumerLatch = new CountDownLatch(1); - Queue queue = new ConcurrentLinkedQueue<>(); - - - ExecutorService executor = Executors.newFixedThreadPool(Math.max(ais.getAnnotationSourceThreadCount(), 1) + 1); // need an extra thread for the consumer, and at least 1 other thread - executor.execute(new Consumer(queue, outputFile, consumerLatch, ais, exec)); - logger.info("ExecutorService has been setup"); - - ChrPosition lastCP = null; - try ( - VcfFileReader reader = new VcfFileReader(inputFile);) { - logger.info("VcfFileReader has been setup"); - int vcfCount = 0; - for (VcfRecord vcf : reader) { - vcfCount++; - - ChrPosition thisVcfsCP = vcf.getChrPositionRefAlt(); - logger.debug("thisVcfsCP: " + thisVcfsCP.toIGVString()); - - - /* - * check that this CP is "after" the last CP - */ - int compare = null != lastCP ? 
((ChrPositionRefAlt)thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0; - if (compare < 0) { - throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString()); - } - - - String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt(); - String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0); - String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0); - - if (alt.contains(",")) { - logger.info("alt has comma: " + thisVcfsCP.toString()); - /* - * split record, create new ChrPositions for each - */ - String [] altArray = alt.split(","); - Map altToADMap = AnnotateUtils.getADForSplitVcfRecords(altArray, gatkAD); - List splitVcfs = new ArrayList<>(); - for (String thisAlt : altArray) { - if (thisAlt.equals("*")) { - /* - * ignore - */ - } else { - VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt); - splitVcfs.add(newVcf); - } - } - if (splitVcfs.size() > 1) { - /* - * sort - */ - splitVcfs.sort(null); - } - for (VcfRecord splitVcf : splitVcfs) { - List annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor)); - queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt)); - } - - } else { - - logger.debug("about to get annotations for: " + thisVcfsCP.toIGVString()); - List annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor); - logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue"); - queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, gatkGT, alt)); - - } - - lastCP = thisVcfsCP; - } - - logger.info("# of vcf records: " + vcfCount); - } finally { - /* - * count down the count down latch - */ - consumerLatch.countDown(); - } - executor.shutdown(); - executor.awaitTermination(60, TimeUnit.MINUTES); - logger.info("ExecutorService has been shutdown"); - 
return exitStatus; - } - - - private static List getAnnotationsForPosition(ChrPosition cp, List annotationSources, Executor executor) { - - return annotationSources.stream() - .map(source -> CompletableFuture.supplyAsync(() -> - source.getAnnotation(cp), executor)) - .map(CompletableFuture::join).collect(Collectors.toList()); - } - - public static class ChrPositionAnnotations { - - public String getGatkAD() { - return gatkAD; - } - - public ChrPosition getCp() { - return cp; - } - - public List getAnnotations() { - return annotations; - } - - public String getGatkGT() { - return gatkGT; - } - - public String getOriginalAlt() { - return originalAlt; - } - - ChrPosition cp; - List annotations; - String gatkAD; - String gatkGT; - String originalAlt; - - public ChrPositionAnnotations(ChrPosition cp, List annotations, String gatkAD, String gatkGT, String originalAlt) { - super(); - this.cp = cp; - this.annotations = annotations; - this.gatkAD = gatkAD; - this.gatkGT = gatkGT; - this.originalAlt = originalAlt; - } - - public String toStringMinusAnnotations() { - return ((ChrPositionRefAlt)cp).toTabSeperatedString() + "\t" + originalAlt + "\t" + gatkGT + "\t" + gatkAD; - } - - } - - public static class Consumer implements Runnable { - - private final Queue queue; - private final String outputFile; - private final boolean includeSearchTerm; - private final CountDownLatch latch; - private final RecordWriter writer; - private final String additionalEmptyValues; - private final AnnotationInputs ais; - - public Consumer(Queue queue, String outputFile, CountDownLatch latch, AnnotationInputs ais, QExec exec) throws IOException { - this.queue = queue; - this.outputFile = outputFile; - this.latch = latch; - this.ais = ais; - includeSearchTerm = ais.isIncludeSearchTerm(); - additionalEmptyValues = AnnotateUtils.generateAdditionalEmptyValues(ais); - List headers = AnnotateUtils.generateHeaders(ais, exec); - - writer = new RecordWriter(new File(outputFile)); - for (String h : 
headers) { - writer.addHeader(h); - } - } - - @Override - public void run() { - logger.info("Consumer thread is a go!"); - try { - - while (true) { - - final ChrPositionAnnotations rec = queue.poll(); - if (null != rec) { - - processRecord(rec); - - } else { - if (latch.getCount() == 0) { - break; - } - // sleep and try again - try { - Thread.sleep(20); - } catch (final InterruptedException e) { - logger.error("InterruptedException caught in Consumer sleep: " + e.getLocalizedMessage()); - throw e; - } finally { - } - } - } - } catch (final Exception e) { - e.printStackTrace(); - logger.error("Exception caught in Consumer class: " + e.getCause().getMessage()); - } finally { - logger.info("Consumer: shutting down"); - /* - * close writer - */ - try { - writer.close(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - } - - public void processRecord(final ChrPositionAnnotations recAndAnnotations) throws IOException { - ChrPosition cp = recAndAnnotations.getCp(); - - - List annotations = recAndAnnotations.getAnnotations(); - logger.debug("annotations.size(): " + annotations.size()); - - /* - * collect entries in annotations lists into map - */ - List singleAnnotations = AnnotateUtils.convertAnnotations(annotations); - logger.debug("singleAnnotations.size(): " + singleAnnotations.size()); - - - String searchTerm = ""; - if (includeSearchTerm) { - String hgvsC = AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.c").orElse(null); - String hgvsP = AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.p").orElse(null); - searchTerm = AnnotateUtils.getSearchTerm(hgvsC, hgvsP); - } - /* - * sort and write out to file - */ - String annotationString = singleAnnotations.stream().map(s -> s.split("=", 2)).sorted(CUSTOM_COMPARATOR).map(a -> a[1]).collect(Collectors.joining("\t")); - - logger.debug("annotationString: " + annotationString); - - writer.add(recAndAnnotations.toStringMinusAnnotations() + "\t" + 
annotationString + additionalEmptyValues + (includeSearchTerm ? "\t" + searchTerm : "")); - } - } - - public static void main(String[] args) throws Exception { - final Annotate sp = new Annotate(); - int exitStatus = 0; - try { - exitStatus = sp.setup(args); - } catch (final Exception e) { - exitStatus = 1; - if (null != logger) { - logger.error("Exception caught whilst running Annotate:", e); - } else { - System.err.println("Exception caught whilst running Annotate"); - } - e.printStackTrace(); - } - - if (null != logger) { - logger.logFinalExecutionStats(exitStatus); - } - System.exit(exitStatus); - } - - protected int setup(String args[]) throws Exception { - int returnStatus = 1; - if (null == args || args.length == 0) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - System.exit(1); - } - final Options options = new Options(args); - - System.out.println("options.getInputFileName: " + options.getInputFileName()); - System.out.println("options.getOutputFileName: " + options.getOutputFileName()); - System.out.println("options.getOutputFileName: " + options.getOutputFileName()); - System.out.println("options.getConfigFileName: " + options.getConfigFileName()); - if ( null == options.getInputFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else if ( null == options.getOutputFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else if ( null == options.getLogFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else if ( null == options.getConfigFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else { - // configure logging - logFile = options.getLogFileName(); - logger = QLoggerFactory.getLogger(Annotate.class, logFile, options.getLogLevel()); - exec = logger.logInitialExecutionStats("Annotate", Annotate.class.getPackage().getImplementationVersion(), args); - outputFile = options.getOutputFileName(); - inputFile = options.getInputFileName(); - jsonInputs = 
options.getConfigFileName(); - - return engage(); - } - - return returnStatus; - } + + static final List SEARCH_TERM_VARIETIES = Arrays.asList(">", "->", "-->", "/"); + + static Comparator CUSTOM_COMPARATOR; + static QLogger logger; + + private int exitStatus; + + private String inputFile; + private String outputFile; + private String jsonInputs; + + private QExec exec; + + public int engage() throws Exception { + + /* + * parse the json file into an AnnotationInputs object + */ + AnnotationInputs ais = AnnotateUtils.getInputs(jsonInputs); + logger.info("Number of annotation source threads to use: " + ais.getAnnotationSourceThreadCount()); + /* + * create a comparator that will be used to sort the annotation fields for output + */ + CUSTOM_COMPARATOR = AnnotateUtils.createComparatorFromList(Arrays.stream(ais.getOutputFieldOrder().split(",")).collect(Collectors.toList())); + logger.info("Custom comparator created"); + /* + * check headers that have been supplied in the json inputs file + */ + int headersOK = AnnotateUtils.checkHeaders(ais); + if (headersOK == 1) { + logger.error("Headers have been checked - not OK!!!"); + System.exit(headersOK); + } + logger.info("Headers have been checked - OK"); + + List annotationSources = new ArrayList<>(); + AnnotateUtils.populateAnnotationSources(ais, annotationSources); + logger.info("annotationSources have been loaded (size: " + annotationSources.size() + ")"); + annotationSources.forEach(as -> logger.info(as.toString())); + + CountDownLatch consumerLatch = new CountDownLatch(1); + Queue queue = new ConcurrentLinkedQueue<>(); + + + ExecutorService executor = Executors.newFixedThreadPool(Math.max(ais.getAnnotationSourceThreadCount(), 1) + 1); // need an extra thread for the consumer, and at least 1 other thread + executor.execute(new Consumer(queue, outputFile, consumerLatch, ais, exec)); + logger.info("ExecutorService has been setup"); + + ChrPosition lastCP = null; + try ( + VcfFileReader reader = new 
VcfFileReader(inputFile)) { + logger.info("VcfFileReader has been setup"); + int vcfCount = 0; + for (VcfRecord vcf : reader) { + vcfCount++; + + ChrPosition thisVcfsCP = vcf.getChrPositionRefAlt(); + logger.debug("thisVcfsCP: " + thisVcfsCP.toIGVString()); + + + /* + * check that this CP is "after" the last CP + */ + int compare = null != lastCP ? ((ChrPositionRefAlt) thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0; + if (compare < 0) { + throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString()); + } + + + String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt(); + String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0); + String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0); + + if (alt.contains(",")) { + logger.info("alt has comma: " + thisVcfsCP); + /* + * split record, create new ChrPositions for each + */ + String[] altArray = alt.split(","); + List splitVcfs = new ArrayList<>(); + for (String thisAlt : altArray) { + if (thisAlt.equals("*")) { + /* + * ignore + */ + } else { + VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt); + splitVcfs.add(newVcf); + } + } + if (splitVcfs.size() > 1) { + /* + * sort + */ + splitVcfs.sort(null); + } + for (VcfRecord splitVcf : splitVcfs) { + List annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor)); + queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt)); + } + + } else { + + logger.debug("about to get annotations for: " + thisVcfsCP.toIGVString()); + List annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor); + logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue"); + queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, gatkGT, alt)); + + } + + lastCP = thisVcfsCP; + } + + 
logger.info("# of vcf records: " + vcfCount); + } finally { + /* + * count down the count down latch + */ + consumerLatch.countDown(); + } + executor.shutdown(); + executor.awaitTermination(60, TimeUnit.MINUTES); + logger.info("ExecutorService has been shutdown"); + return exitStatus; + } + + + private static List getAnnotationsForPosition(ChrPosition cp, List annotationSources, Executor executor) { + long contigAndPosition = ((ChrPositionUtils.convertContigAndPositionToLong(cp.getChromosome().startsWith("chr") ? cp.getChromosome().substring(3) : cp.getChromosome(), cp.getStartPosition()))); + return annotationSources.stream() + .map(source -> CompletableFuture.supplyAsync(() -> + source.getAnnotation(contigAndPosition, cp), executor)) + .map(CompletableFuture::join).collect(Collectors.toList()); + } + + public static class ChrPositionAnnotations { + + public List getAnnotations() { + return annotations; + } + + ChrPosition cp; + List annotations; + String gatkAD; + String gatkGT; + String originalAlt; + + public ChrPositionAnnotations(ChrPosition cp, List annotations, String gatkAD, String gatkGT, String originalAlt) { + super(); + this.cp = cp; + this.annotations = annotations; + this.gatkAD = gatkAD; + this.gatkGT = gatkGT; + this.originalAlt = originalAlt; + } + + public String toStringMinusAnnotations() { + return ((ChrPositionRefAlt) cp).toTabSeperatedString() + "\t" + originalAlt + "\t" + gatkGT + "\t" + gatkAD; + } + + } + + public static class Consumer implements Runnable { + + private final Queue queue; + private final boolean includeSearchTerm; + private final CountDownLatch latch; + private final RecordWriter writer; + private final String additionalEmptyValues; + + public Consumer(Queue queue, String outputFile, CountDownLatch latch, AnnotationInputs ais, QExec exec) throws IOException { + this.queue = queue; + this.latch = latch; + includeSearchTerm = ais.isIncludeSearchTerm(); + additionalEmptyValues = 
AnnotateUtils.generateAdditionalEmptyValues(ais); + List headers = AnnotateUtils.generateHeaders(ais, exec); + + writer = new RecordWriter<>(new File(outputFile)); + for (String h : headers) { + writer.addHeader(h); + } + } + + @Override + public void run() { + logger.info("Consumer thread is a go!"); + try { + + while (true) { + + final ChrPositionAnnotations rec = queue.poll(); + if (null != rec) { + + processRecord(rec); + + } else { + if (latch.getCount() == 0) { + break; + } + // sleep and try again + try { + Thread.sleep(20); + } catch (final InterruptedException e) { + logger.error("InterruptedException caught in Consumer sleep: " + e.getLocalizedMessage()); + throw e; + } + } + } + } catch (final Exception e) { + e.printStackTrace(); + logger.error("Exception caught in Consumer class: " + e.getCause().getMessage()); + } finally { + logger.info("Consumer: shutting down"); + /* + * close writer + */ + try { + writer.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + public void processRecord(final ChrPositionAnnotations recAndAnnotations) throws IOException { + + List annotations = recAndAnnotations.getAnnotations(); + logger.debug("annotations.size(): " + annotations.size()); + + /* + * collect entries in annotations lists into map + */ + List singleAnnotations = AnnotateUtils.convertAnnotations(annotations); + logger.debug("singleAnnotations.size(): " + singleAnnotations.size()); + + + String searchTerm = ""; + if (includeSearchTerm) { + String hgvsC = AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.c").orElse(null); + String hgvsP = AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.p").orElse(null); + searchTerm = AnnotateUtils.getSearchTerm(hgvsC, hgvsP); + } + /* + * sort and write out to file + */ + String annotationString = singleAnnotations.stream().map(s -> s.split("=", 2)).sorted(CUSTOM_COMPARATOR).map(a -> a[1]).collect(Collectors.joining("\t")); + + 
logger.debug("annotationString: " + annotationString); + + writer.add(recAndAnnotations.toStringMinusAnnotations() + "\t" + annotationString + additionalEmptyValues + (includeSearchTerm ? "\t" + searchTerm : "")); + } + } + + public static void main(String[] args) { + final Annotate sp = new Annotate(); + int exitStatus = 0; + try { + exitStatus = sp.setup(args); + } catch (final Exception e) { + exitStatus = 1; + if (null != logger) { + logger.error("Exception caught whilst running Annotate:", e); + } else { + System.err.println("Exception caught whilst running Annotate"); + } + e.printStackTrace(); + } + + if (null != logger) { + logger.logFinalExecutionStats(exitStatus); + } + System.exit(exitStatus); + } + + protected int setup(String [] args) throws Exception { + int returnStatus = 1; + if (null == args || args.length == 0) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + System.exit(1); + } + final Options options = new Options(args); + + System.out.println("options.getInputFileName: " + options.getInputFileName()); + System.out.println("options.getOutputFileName: " + options.getOutputFileName()); + System.out.println("options.getConfigFileName: " + options.getConfigFileName()); + if (null == options.getInputFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else if (null == options.getOutputFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else if (null == options.getLogFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else if (null == options.getConfigFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else { + // configure logging + String logFile = options.getLogFileName(); + logger = QLoggerFactory.getLogger(Annotate.class, logFile, options.getLogLevel()); + exec = logger.logInitialExecutionStats("Annotate", Annotate.class.getPackage().getImplementationVersion(), args); + outputFile = options.getOutputFileName(); + inputFile = 
options.getInputFileName(); + jsonInputs = options.getConfigFileName(); + + return engage(); + } + + return returnStatus; + } } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java index 05f5e5a95..c454893e4 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java @@ -1,21 +1,8 @@ package au.edu.qimr.qannotate.nanno; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; - +import au.edu.qimr.qannotate.nanno.AnnotationInputs.AnnotationInput; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; import org.qcmg.common.log.QLogger; import org.qcmg.common.log.QLoggerFactory; import org.qcmg.common.meta.QExec; @@ -23,273 +10,271 @@ import org.qcmg.common.util.TabTokenizer; import org.qcmg.qio.record.StringFileReader; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; - -import au.edu.qimr.qannotate.nanno.AnnotationInputs.AnnotationInput; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; public class AnnotateUtils { - - public static final QLogger logger = QLoggerFactory.getLogger(AnnotateUtils.class); - - public static AnnotationInputs getInputs(String file) throws IOException { - //read json file data to String - byte[] jsonData = Files.readAllBytes(Paths.get(file)); - //create ObjectMapper instance - 
ObjectMapper objectMapper = new ObjectMapper(); - objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - //convert json string to object - AnnotationInputs ai = objectMapper.readValue(jsonData, AnnotationInputs.class); - - return ai; - } - - public static Comparator createComparatorFromList(final List sortedList) { - Comparator c = new Comparator() { - @Override - public int compare(String[] o1, String[] o2) { - final int index1 = sortedList.indexOf(o1[0]); - if (index1 == -1) return 1; - final int index2 = sortedList.indexOf(o2[0]); - if (index2 == -1) return -1; - return index1 - index2; - } - }; - return c; - } - - /** - * @param ais - * @param annotationSources - * @throws IOException - */ - public static void populateAnnotationSources(AnnotationInputs ais, List annotationSources) throws IOException { - for (AnnotationInput ai : ais.getInputs()) { - String fileName = ai.getFile(); - String fieldNames = ai.getFields(); - - logger.info("fileName: " + fileName + ", positions: " + ai.getChrIndex() + ", " + ai.getPositionIndex() + ", " + ai.getRefIndex() + ", " + ai.getAltIndex() + ", fieldNames: " + fieldNames); - - if (ai.isSnpEffVcf()) { - annotationSources.add(new AnnotationSourceSnpEffVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames)); - } else if (fileName.contains("vcf")) { - annotationSources.add(new AnnotationSourceVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames)); - } else { - annotationSources.add(new AnnotationSourceTSV(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames)); - } - } - } - - public static int checkHeaders(AnnotationInputs ais) { - List annotationFields = ais.getInputs().stream().map(ai -> 
ai.getFields()).collect(Collectors.toList()); - boolean headersValid = AnnotateUtils.isOrderedHeaderListValid(ais.getOutputFieldOrder(), annotationFields.toArray(new String[]{})); - - if ( ! headersValid) { - System.err.println("headers are not valid! OrderedHeader: " + ais.getOutputFieldOrder() + "\nAnnotation fields: " + (ais.getInputs().stream().map(ai -> ai.getFields())).collect(Collectors.joining(","))); - return 1; - } - return 0; - } - - /** - * checks to see if the sortedHEader contains all the fields from the various annotation sources - * Not sure what to do if we have 2 fields with the same name (presumably from different sources) - * - * - * @param ai - * @return - */ - public static boolean isOrderedHeaderListValid(String sortedHeader, String ... fieldsFromAnnotationSources) { - if (StringUtils.isNullOrEmpty(sortedHeader)) { - /* - * empty or null sorted header - not valid - */ - logger.error("sortedHeader is null or empty"); - return false; - } - if (null == fieldsFromAnnotationSources || fieldsFromAnnotationSources.length == 0) { - /* - * empty or null annotation fields - not valid - */ - logger.error("fieldsFromAnnotationSources is null or length is 0"); - return false; - } - - Set sortedHeaderSet = Arrays.stream(sortedHeader.split(",")).collect(Collectors.toSet()); - Set fieldsFromAnnotationSourcesSet = Arrays.stream(String.join(",", fieldsFromAnnotationSources).split(",")).collect(Collectors.toSet()); - - for (String s : sortedHeaderSet) { - if ( ! fieldsFromAnnotationSourcesSet.contains(s)) { - logger.error(s + " in header but not found in any data source!"); - } - } - for (String s : fieldsFromAnnotationSourcesSet) { - if ( ! 
sortedHeaderSet.contains(s)) { - logger.error(s + " in data source but not found in header!"); - } - } - - return sortedHeaderSet.containsAll(fieldsFromAnnotationSourcesSet) && fieldsFromAnnotationSourcesSet.containsAll(sortedHeaderSet); - } - - public static String getEmptyHeaderValues(int count) { - if (count <= 0) { - return ""; - } - return org.apache.commons.lang3.StringUtils.repeat("\t", count); - } - - public static int countOccurrences(String s, String t) { - return org.apache.commons.lang3.StringUtils.countMatches(s, t); - } - - /** - * Create a PubMed search term using the hgvsC and hgvsP values - * @param hgvsC - * @param hgvsP - * @return - */ - public static String getSearchTerm(String hgvsC, String hgvsP) { - String st = ""; - - /* - * check the optionals - if they are both not present, no need to proceed - */ - if (( hgvsC == null && hgvsP == null)) { - return st; - } - - if ( hgvsC != null && ! hgvsC.isEmpty()) { - - /* - * need to check that the string contains the dot ('.') and the gt sign ('>') - */ - int dotIndex = hgvsC.indexOf('.'); - int gtIndex = hgvsC.indexOf('>'); - if (dotIndex > -1 && gtIndex > -1) { - - /* - * split value into required parts - */ - String firstPart = hgvsC.substring(dotIndex + 1, gtIndex); - String secondPart = hgvsC.substring(gtIndex + 1); - - st += Annotate.SEARCH_TERM_VARIETIES.stream().map(s -> "\"" + firstPart + s + secondPart + "\"").collect(Collectors.joining("|")); - } - } - - if ( hgvsP != null && ! hgvsP.isEmpty()) { - if ( ! st.isEmpty()) { - /* - * we must have hgvs.c data - so add bar - */ - st += "|"; - } - st += "\"" + hgvsP.substring(hgvsP.indexOf('.') + 1) + "\""; - } - - if ( ! 
st.isEmpty()) { - return "\"GENE\"+(" + st + ")"; - } - return st; - } - - /** - * Splits the strings in the supplied list by tab, and flattens them to a single list - */ - public static List convertAnnotations(List manyAnnotations) { - if (null != manyAnnotations) { - return manyAnnotations.stream().flatMap(s -> java.util.Arrays.stream(TabTokenizer.tokenize(s))).collect(Collectors.toList()); - } - return Collections.emptyList(); - } - - /** - * get the requiredAnnotation value from the list of annotations - * return null if not present - * - * @param listOfAnnotations - * @param requiredAnnotation - * @return - */ - public static Optional getAnnotationFromList(List listOfAnnotations, String requiredAnnotation) { - - if (null != listOfAnnotations && ! StringUtils.isNullOrEmpty(requiredAnnotation)) { - for (String anno : listOfAnnotations) { - if (anno.startsWith(requiredAnnotation)) { - return Optional.of(anno.substring(requiredAnnotation.length() + 1)); // don't forget the equals sign - } - } - } - return Optional.empty(); - } - - /** - * - * @param altArray - * @param gatkAD - * @return - */ - public static Map getADForSplitVcfRecords(String [] altArray, String gatkAD) { - - Map altToADMap = new HashMap<>(4); - String [] gatkADArray = gatkAD.split(","); - /* - * should have 1 more in the gatkADArray than the altArray - */ - if (altArray.length == gatkADArray.length - 1) { - for (int i = 0 ; i < altArray.length ; i++) { - altToADMap.put(altArray[i], gatkADArray[0] + "," + gatkADArray[i + 1]); - } - } - - return altToADMap; - } - - public static List generateHeaders(AnnotationInputs ais, QExec exec) { - List headers = new ArrayList<>(); - if (null != exec) { - headers.add("##" + exec.getStartTime().toLogString()); - headers.add("##" + exec.getUuid().toLogString()); - headers.add("##" + exec.getHost().toLogString()); - headers.add("##" + exec.getRunBy().toLogString()); - headers.add("##" + exec.getJavaVersion().toLogString()); - headers.add("##" + 
exec.getToolName().toLogString()); - headers.add("##" + exec.getToolVersion().toLogString()); - headers.add("##" + exec.getCommandLine().toLogString()); - } - if (null != ais && null != ais.getInputs()) { - - for (AnnotationInput ai : ais.getInputs()) { - headers.add("##file:fields\t" + ai.getFile() + ":" + ai.getFields()); - } - - String emptyHeaders = ais.getAdditionalEmptyFields(); - String [] emptyHeadersArray = StringUtils.isNullOrEmpty(emptyHeaders) ? new String[]{} : emptyHeaders.split(","); - String fieldOrder = ais.getOutputFieldOrder(); - String [] fieldOrderArray = StringUtils.isNullOrEmpty(fieldOrder) ? new String[]{} : fieldOrder.split(","); - - String header = "#chr\tposition\tref\talt\toriginal_alt\tGATK_GT\tGATK_AD\t" + Arrays.stream(fieldOrderArray).collect(Collectors.joining("\t")); - if (emptyHeadersArray.length > 0) { - header += "\t" + Arrays.stream(emptyHeadersArray).collect(Collectors.joining("\t")); - } - - boolean includeSearchTerm = ais.isIncludeSearchTerm(); - header += (includeSearchTerm ? 
"\tsearchTerm" : ""); - headers.add(header); - } - - return headers; - } - - public static String generateAdditionalEmptyValues(AnnotationInputs ais) { - String emptyHeaders = ais.getAdditionalEmptyFields(); - - if (StringUtils.isNullOrEmpty(emptyHeaders)) { - return ""; - } else { - return getEmptyHeaderValues(org.apache.commons.lang3.StringUtils.countMatches(emptyHeaders, ",") + 1); - } - } + + public static final QLogger logger = QLoggerFactory.getLogger(AnnotateUtils.class); + + public static AnnotationInputs getInputs(String file) throws IOException { + //read json file data to String + byte[] jsonData = Files.readAllBytes(Paths.get(file)); + //create ObjectMapper instance + ObjectMapper objectMapper = new ObjectMapper(); + objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + //convert json string to object + + return objectMapper.readValue(jsonData, AnnotationInputs.class); + } + + public static Comparator createComparatorFromList(final List sortedList) { + return (o1, o2) -> { + final int index1 = sortedList.indexOf(o1[0]); + if (index1 == -1) return 1; + final int index2 = sortedList.indexOf(o2[0]); + if (index2 == -1) return -1; + return index1 - index2; + }; + } + + /** + * @param ais + * @param annotationSources + * @throws IOException + */ + public static void populateAnnotationSources(AnnotationInputs ais, List annotationSources) throws IOException { + for (AnnotationInput ai : ais.getInputs()) { + String fileName = ai.getFile(); + String fieldNames = ai.getFields(); + + logger.info("fileName: " + fileName + ", positions: " + ai.getChrIndex() + ", " + ai.getPositionIndex() + ", " + ai.getRefIndex() + ", " + ai.getAltIndex() + ", fieldNames: " + fieldNames); + + if (ai.isSnpEffVcf()) { + annotationSources.add(new AnnotationSourceSnpEffVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames, ai.isChrStartsWithChr())); + } else if 
(fileName.contains("vcf")) { + annotationSources.add(new AnnotationSourceVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames, ai.isChrStartsWithChr())); + } else { + annotationSources.add(new AnnotationSourceTSV(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames, ai.isChrStartsWithChr())); + } + } + } + + public static int checkHeaders(AnnotationInputs ais) { + List annotationFields = ais.getInputs().stream().map(AnnotationInput::getFields).toList(); + boolean headersValid = AnnotateUtils.isOrderedHeaderListValid(ais.getOutputFieldOrder(), annotationFields.toArray(new String[]{})); + + if (!headersValid) { + System.err.println("headers are not valid! OrderedHeader: " + ais.getOutputFieldOrder() + "\nAnnotation fields: " + (ais.getInputs().stream().map(AnnotationInput::getFields)).collect(Collectors.joining(","))); + return 1; + } + return 0; + } + + /** + * Checks if the ordered list of headers is valid. + * + * @param sortedHeader The sorted header string. + * @param fieldsFromAnnotationSources The fields from annotation sources. + * @return Returns true if the ordered header list is valid, false otherwise. + */ + public static boolean isOrderedHeaderListValid(String sortedHeader, String... 
fieldsFromAnnotationSources) { + if (StringUtils.isNullOrEmpty(sortedHeader)) { + /* + * empty or null sorted header - not valid + */ + logger.error("sortedHeader is null or empty"); + return false; + } + if (null == fieldsFromAnnotationSources || fieldsFromAnnotationSources.length == 0) { + /* + * empty or null annotation fields - not valid + */ + logger.error("fieldsFromAnnotationSources is null or length is 0"); + return false; + } + + Set sortedHeaderSet = Arrays.stream(sortedHeader.split(",")).collect(Collectors.toSet()); + Set fieldsFromAnnotationSourcesSet = Arrays.stream(String.join(",", fieldsFromAnnotationSources).split(",")).collect(Collectors.toSet()); + + for (String s : sortedHeaderSet) { + if (!fieldsFromAnnotationSourcesSet.contains(s)) { + logger.error(s + " in header but not found in any data source!"); + } + } + for (String s : fieldsFromAnnotationSourcesSet) { + if (!sortedHeaderSet.contains(s)) { + logger.error(s + " in data source but not found in header!"); + } + } + + return sortedHeaderSet.containsAll(fieldsFromAnnotationSourcesSet) && fieldsFromAnnotationSourcesSet.containsAll(sortedHeaderSet); + } + + public static String getEmptyHeaderValues(int count) { + if (count <= 0) { + return ""; + } + return org.apache.commons.lang3.StringUtils.repeat("\t", count); + } + + public static int countOccurrences(String s, String t) { + return org.apache.commons.lang3.StringUtils.countMatches(s, t); + } + + /** + * Create a PubMed search term using the hgvsC and hgvsP values + * + * @param hgvsC + * @param hgvsP + * @return + */ + public static String getSearchTerm(String hgvsC, String hgvsP) { + String st = ""; + + /* + * check the optionals - if they are both not present, no need to proceed + */ + if ((hgvsC == null && hgvsP == null)) { + return st; + } + + if (hgvsC != null && !hgvsC.isEmpty()) { + + /* + * need to check that the string contains the dot ('.') and the gt sign ('>') + */ + int dotIndex = hgvsC.indexOf('.'); + int gtIndex = 
hgvsC.indexOf('>'); + if (dotIndex > -1 && gtIndex > -1) { + + /* + * split value into required parts + */ + String firstPart = hgvsC.substring(dotIndex + 1, gtIndex); + String secondPart = hgvsC.substring(gtIndex + 1); + + st += Annotate.SEARCH_TERM_VARIETIES.stream().map(s -> "\"" + firstPart + s + secondPart + "\"").collect(Collectors.joining("|")); + } + } + + if (hgvsP != null && !hgvsP.isEmpty()) { + if (!st.isEmpty()) { + /* + * we must have hgvs.c data - so add bar + */ + st += "|"; + } + st += "\"" + hgvsP.substring(hgvsP.indexOf('.') + 1) + "\""; + } + + if (!st.isEmpty()) { + return "\"GENE\"+(" + st + ")"; + } + return st; + } + + /** + * Splits the strings in the supplied list by tab, and flattens them to a single list + */ + public static List convertAnnotations(List manyAnnotations) { + if (null != manyAnnotations) { + return manyAnnotations.stream().flatMap(s -> java.util.Arrays.stream(TabTokenizer.tokenize(s))).collect(Collectors.toList()); + } + return Collections.emptyList(); + } + + /** + * get the requiredAnnotation value from the list of annotations + * return null if not present + * + * @param listOfAnnotations + * @param requiredAnnotation + * @return + */ + public static Optional getAnnotationFromList(List listOfAnnotations, String requiredAnnotation) { + + if (null != listOfAnnotations && !StringUtils.isNullOrEmpty(requiredAnnotation)) { + for (String anno : listOfAnnotations) { + if (anno.startsWith(requiredAnnotation)) { + return Optional.of(anno.substring(requiredAnnotation.length() + 1)); // don't forget the equals sign + } + } + } + return Optional.empty(); + } + + /** + * Retrieves the AD (allele depth) values for split VCF records. + * + * @param altArray An array of alternate alleles. + * @param gatkAD The GATK AD field containing comma-separated allele depth values. + * @return A map of alternate alleles to their corresponding AD values. 
+ */ + public static Map getADForSplitVcfRecords(String[] altArray, String gatkAD) { + + Map altToADMap = new HashMap<>(4); + String[] gatkADArray = gatkAD.split(","); + /* + * should have 1 more in the gatkADArray than the altArray + */ + if (altArray.length == gatkADArray.length - 1) { + for (int i = 0; i < altArray.length; i++) { + altToADMap.put(altArray[i], gatkADArray[0] + "," + gatkADArray[i + 1]); + } + } + + return altToADMap; + } + + public static List generateHeaders(AnnotationInputs ais, QExec exec) { + List headers = new ArrayList<>(); + if (null != exec) { + headers.add("##" + exec.getStartTime().toLogString()); + headers.add("##" + exec.getUuid().toLogString()); + headers.add("##" + exec.getHost().toLogString()); + headers.add("##" + exec.getRunBy().toLogString()); + headers.add("##" + exec.getJavaVersion().toLogString()); + headers.add("##" + exec.getToolName().toLogString()); + headers.add("##" + exec.getToolVersion().toLogString()); + headers.add("##" + exec.getCommandLine().toLogString()); + } + if (null != ais && null != ais.getInputs()) { + + for (AnnotationInput ai : ais.getInputs()) { + headers.add("##file:fields\t" + ai.getFile() + ":" + ai.getFields()); + } + + String emptyHeaders = ais.getAdditionalEmptyFields(); + String[] emptyHeadersArray = StringUtils.isNullOrEmpty(emptyHeaders) ? new String[]{} : emptyHeaders.split(","); + String fieldOrder = ais.getOutputFieldOrder(); + String[] fieldOrderArray = StringUtils.isNullOrEmpty(fieldOrder) ? new String[]{} : fieldOrder.split(","); + + String header = "#chr\tposition\tref\talt\toriginal_alt\tGATK_GT\tGATK_AD\t" + String.join("\t", fieldOrderArray); + if (emptyHeadersArray.length > 0) { + header += "\t" + String.join("\t", emptyHeadersArray); + } + + boolean includeSearchTerm = ais.isIncludeSearchTerm(); + header += (includeSearchTerm ? 
"\tsearchTerm" : ""); + headers.add(header); + } + + return headers; + } + + public static String generateAdditionalEmptyValues(AnnotationInputs ais) { + String emptyHeaders = ais.getAdditionalEmptyFields(); + + if (StringUtils.isNullOrEmpty(emptyHeaders)) { + return ""; + } else { + return getEmptyHeaderValues(org.apache.commons.lang3.StringUtils.countMatches(emptyHeaders, ",") + 1); + } + } } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java index cace848dd..64d3d00a5 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java @@ -31,6 +31,8 @@ public int getAnnotationSourceThreadCount() { } +// public record AnnotationInput(String file, int chrIndex, int positionIndex, int refIndex, +// int altIndex, boolean snpEffVcf, String fields, boolean chrStartsWithChr) {} public static class AnnotationInput { private String file; private int chrIndex; @@ -39,34 +41,40 @@ public static class AnnotationInput { private int altIndex; private boolean snpEffVcf; private String fields; - + private boolean chrStartsWithChr; + + public AnnotationInput() {} + public String getFile() { return file; } - + public int getChrIndex() { return chrIndex; } - + public String getFields() { return fields; } - + public int getPositionIndex() { return positionIndex; } - + public int getRefIndex() { return refIndex; } - + public int getAltIndex() { return altIndex; } - + public boolean isSnpEffVcf() { return snpEffVcf; } - - } + + public boolean isChrStartsWithChr() { + return chrStartsWithChr; + } +} } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java index 39cdda7c2..79e5fd12d 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java @@ 
-14,233 +14,338 @@ import org.qcmg.common.model.ChrPosition; import org.qcmg.common.model.ChrPositionComparator; import org.qcmg.common.model.ChrPositionRefAlt; +import org.qcmg.common.util.ChrPositionUtils; import org.qcmg.common.util.TabTokenizer; import org.qcmg.qio.record.RecordReader; public abstract class AnnotationSource implements Closeable { - - public static final String FIELD_DELIMITER_EQ = "="; - public static final char DEFAULT_DELIMITER = '\t'; - public static final String FIELD_DELIMITER_TAB = "\t"; - public static final Comparator COMP = ChrPositionComparator.getStringComparatorForHG38(); - - static final QLogger logger = QLoggerFactory.getLogger(AnnotationSource.class); - - List currentRecords; - List nextRecords; - ChrPosition currentCP; - ChrPosition nextCP; - - protected final RecordReader reader; - protected final Iterator iter; - protected final int chrPositionInRecord; - protected final int positionPositionInRecord; - protected final int refPositionInFile; - protected final int altPositionInFile; - - - public AnnotationSource(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile) { - super(); - this.reader = reader; - this.iter = reader.iterator(); - this.chrPositionInRecord = chrPositionInRecord - 1; - this.positionPositionInRecord = positionPositionInRecord - 1; - this.refPositionInFile = refPositionInFile - 1; - this.altPositionInFile = altPositionInFile - 1; - } - - - public abstract String annotationToReturn(String record); - - public static String getEmptyRecordReturnValue(String fieldNames) { - return Arrays.stream(fieldNames.split(",")).map(s -> s + "=").collect(Collectors.joining(FIELD_DELIMITER_TAB)); - } - - public String getAnnotation(ChrPosition requestedCp) { - - logger.debug(reader.getFile().getName() + ": requestedCp is " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null) + ", nextCP: " + (null != nextCP ? nextCP.toIGVString() : null)); - - - /* - * check to see if the records we currently have stored are a match - */ - if ( areCPsEqual(requestedCp, currentCP) == 0) { - - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturn(rec); - } - } - - } else if (null != nextCP && areCPsEqual(requestedCp, nextCP) < 0) { - /* - * requestedCp is "less than" next CP - * return empty list here - */ - } else { - logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null)); - getNextRecord(requestedCp); - if ( areCPsEqual(requestedCp, currentCP) == 0) { - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturn(rec); - } - } - } else { - /* - * requestedCP and currentCP are not equal - */ - } - } - return annotationToReturn(null); - } - - void getNextRecord(ChrPosition requestedCp) { - currentRecords = new ArrayList<>(); - - /* - * check to see if the nextCP meets our criteria - * if it does, set currentCP to be next CP - * we still need to iterate through the file to see if we have more than this entry - */ - int nextCPmatch = areCPsEqual(requestedCp, nextCP); - if (nextCPmatch == 0) { - currentCP = nextCP; - currentRecords = nextRecords; - } - - - while (iter.hasNext()) { - String nextRecord = iter.next(); - /* - * check to see if this record is the one we want - */ - String[] nextRecordArray = TabTokenizer.partialTokenize(nextRecord, DEFAULT_DELIMITER, Math.max(chrPositionInRecord, positionPositionInRecord) + 1); - - int match = isThisOurRecord(requestedCp, nextRecordArray, chrPositionInRecord, positionPositionInRecord); - - if (match == 0) { - /* - * got a match! 
- * we could have more than 1 entry for each position - */ - currentCP = getCpFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord); - currentRecords.add(nextRecord); - - } else if (match < 0) { - /* - * we have overshot - set nextCP and break out - */ - nextCP = getCpFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord); - nextRecords = new ArrayList<>(); - nextRecords.add(nextRecord); - break; - } else { - /* - * no match yet - keep going - */ - } - } - } - - public static ChrPosition getCpFromRecord(String[] rec, int chrPositionInRecord, int positionPositionInRecord) { - if (null == rec || rec.length == 0) { - throw new IllegalArgumentException("String array rec is null or empty"); - } - if (Math.max(chrPositionInRecord, positionPositionInRecord) >= rec.length) { - throw new IllegalArgumentException("String array rec is of length: " + rec.length + ", and Math.max(chrPositionInRecord, positionPositionInRecord): " + Math.max(chrPositionInRecord, positionPositionInRecord)); - } - return new ChrPointPosition(rec[chrPositionInRecord], Integer.parseInt(rec[positionPositionInRecord])); - } - - /* - * 1 based numbering - * much like a compare method, this will return 0 if the requestedCp is the same as the rec, - * 1 if the requestedCp is upstream of the rec - * -1 if the requestedCp is downstream of the rec - */ - public static int isThisOurRecord(ChrPosition requestedCp, String[] recArray, int chrPositionInRecord, int positionPositionInRecord) { - - return isThisOurRecord(requestedCp, recArray[chrPositionInRecord], Integer.parseInt(recArray[positionPositionInRecord])); - } - - public static int isThisOurRecord(ChrPosition requestedCp, String recordChr, int recordPosition) { - if (null == requestedCp) { - return 1; - } - return compareChromosomeNameAndStartPositions(requestedCp.getChromosome(), requestedCp.getStartPosition(), recordChr, recordPosition); - } - - /** - * - * Compares chromosome names and positions - */ - public 
static int compareChromosomeNameAndStartPositions(String chr1, int position1, String chr2, int position2) { - if (null == chr1) { - return 1; - } - if (null == chr2) { - return -1; - } - boolean chr1StartsWithChr = chr1.startsWith("chr"); - boolean chr2StartsWithChr = chr2.startsWith("chr"); - int diff = COMP.compare((chr1StartsWithChr ? chr1.substring(3) : chr1), (chr2StartsWithChr ? chr2.substring(3) : chr2)); - if (diff != 0) { - return diff; - } - /* - * check position now - */ - return Integer.compare(position1, position2); - } - - - /** - * - * THis is effectively comparing the 2 supplied ChrPositions. - * If the first cp is null, 1 is returned. - * If the second cp is null, -1 is returned - * - * NOTE that if both cps supplied are null, 1 is returned (due to the first cp being null)! - * - * If they are both non-null, the contig names and start and end positions are compared - * - * - * @param cp1 - * @param cp2 - * @return - */ - public static int areCPsEqual(ChrPosition cp1 ,ChrPosition cp2) { - if (null == cp1) { - return 1; - } - if (null == cp2) { - return -1; - } - - int nameAndStartPositionMatch = compareChromosomeNameAndStartPositions(cp1.getChromosome(), cp1.getStartPosition(), cp2.getChromosome(), cp2.getStartPosition()); - if (nameAndStartPositionMatch != 0) { - return nameAndStartPositionMatch; - } - - return Integer.compare(cp1.getEndPosition(), cp2.getEndPosition()); - } + + public static final String FIELD_DELIMITER_EQ = "="; + public static final char DEFAULT_DELIMITER = '\t'; + public static final String FIELD_DELIMITER_TAB = "\t"; + public static final Comparator COMP = ChrPositionComparator.getChrNameComparatorNoChrsOneToM(); + + static final QLogger logger = QLoggerFactory.getLogger(AnnotationSource.class); + + List currentRecords; + List nextRecords; + long currentCPAsLong; + long nextCPAsLong; + + + protected final RecordReader reader; + protected final Iterator iter; + protected final int chrPositionInRecord; + protected final int 
positionPositionInRecord; + protected final int refPositionInFile; + protected final int altPositionInFile; + protected final boolean canUseStartsWith; + + protected final boolean chrStartsWithChr; + + + public AnnotationSource(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, int refPositionInFile, int altPositionInFile, boolean chrStartsWithChr) { + super(); + this.reader = reader; + this.iter = reader.iterator(); + this.chrPositionInRecord = chrPositionInRecord - 1; + this.positionPositionInRecord = positionPositionInRecord - 1; + this.refPositionInFile = refPositionInFile - 1; + this.altPositionInFile = altPositionInFile - 1; + this.canUseStartsWith = this.chrPositionInRecord == 0 && this.positionPositionInRecord == 1; + this.chrStartsWithChr = chrStartsWithChr; + } + + + public abstract String annotationToReturn(String[] record); + + public static String getEmptyRecordReturnValue(String fieldNames) { + return Arrays.stream(fieldNames.split(",")).map(s -> s + FIELD_DELIMITER_EQ).collect(Collectors.joining(FIELD_DELIMITER_TAB)); + } + + public String getAnnotation(long requestedCpAsLong, ChrPosition requestedCp) { + + logger.debug(reader.getFile().getName() + ": requestedCp is " + requestedCpAsLong + ", currentCP: " + currentCPAsLong + ", nextCP: " + nextCPAsLong); + + + /* + * check to see if the records we currently have stored are a match + */ + if (requestedCpAsLong == currentCPAsLong) { + + /* + * we match on position + * lets see if there are any records that match on ref and alt + */ + return getAnnotationsFromCurrentRecords(requestedCp); + + } else { + int matchWithNextCP = Long.compare(requestedCpAsLong, nextCPAsLong); + if (nextCPAsLong > -1 && matchWithNextCP < 0) { + /* + * requestedCp is "less than" next CP + * return empty list here + */ + } else { +// logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? 
requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? currentCP.toIGVString() : null)); + getNextRecord(requestedCpAsLong, matchWithNextCP); + if (requestedCpAsLong == currentCPAsLong) { + return getAnnotationsFromCurrentRecords(requestedCp); + } + /* + * requestedCP and currentCP are not equal + */ + } + } + return annotationToReturn(null); + } + + private String getAnnotationsFromCurrentRecords(ChrPosition requestedCp) { + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturn(recArray); + } + } + } + return annotationToReturn(null); + } + + void getNextRecord(long requestedCpAsLong, int matchWithNextCP) { +// void getNextRecord(ChrPosition requestedCp, int matchWithNextCP) { + currentRecords = new ArrayList<>(4); + + /* + * check to see if the nextCP meets our criteria + * if it does, set currentCP to be next CP + * we still need to iterate through the file to see if we have more than this entry + */ + if (matchWithNextCP == 0) { + currentCPAsLong = nextCPAsLong; + currentRecords = nextRecords; + } + +// String startsWithString = (chrStartsWithChr ? 
requestedCp.getChromosome() : requestedCp.getChromosome().substring(3)); + while (iter.hasNext()) { + /* + * check to see if this record is the one we want + */ + String nextRecord = iter.next(); + int match; + String[] nextRecordArray = null; + if (canUseStartsWith) { + match = isThisOurRecordShortcut(requestedCpAsLong, nextRecord, chrStartsWithChr); + } else { + nextRecordArray = TabTokenizer.partialTokenize(nextRecord, DEFAULT_DELIMITER, Math.max(chrPositionInRecord, positionPositionInRecord) + 1); + match = isThisOurRecord(requestedCpAsLong, nextRecordArray, chrPositionInRecord, positionPositionInRecord, chrStartsWithChr); + } + if (match == 0) { + /* + * got a match! + * we could have more than 1 entry for each position + */ + currentCPAsLong = getChrPositionAsLongFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord, nextRecord, chrStartsWithChr); + currentRecords.add(nextRecord); + + } else if (match < 0) { + /* + * we have overshot - set nextCP and break out + */ + nextCPAsLong = getChrPositionAsLongFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord, nextRecord, chrStartsWithChr); + nextRecords = new ArrayList<>(); + nextRecords.add(nextRecord); + break; + } + /* + * no match yet - keep going + */ + } + } + + public static ChrPosition getChrPositionFromRecord(String[] nextRecordArray, int chrPositionInRecord, int positionPositionInRecord, String nextRecord) { + if (null == nextRecordArray) { + int firstTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER); + int secondTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER, firstTabIndex + 1); + return new ChrPointPosition(nextRecord.substring(0, firstTabIndex), Integer.parseInt(nextRecord.substring(firstTabIndex + 1, secondTabIndex))); + } else { + return getCpFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord); + } + } + + public static long getChrPositionAsLongFromRecord(String[] nextRecordArray, int chrPositionInRecord, int positionPositionInRecord, 
String nextRecord, boolean chrStartsWithChr) { + if (null == nextRecordArray) { + int firstTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER); + int secondTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER, firstTabIndex + 1); + String contig = chrStartsWithChr ? nextRecord.substring(3, firstTabIndex) : nextRecord.substring(0, firstTabIndex); + int position = Integer.parseInt(nextRecord, firstTabIndex + 1, secondTabIndex, 10); + return ChrPositionUtils.convertContigAndPositionToLong(contig, position); + } else { + return getCpAsLongFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord, chrStartsWithChr); + } + } + + /** + * Determines if the given record is considered our record by comparing the requested chromosome position (cp) with the record's cp. + * The method compares the chromosome and start positions between the requested cp and the record's cp. + * The cp is represented as a long value, where the upper 32 bits represent the chromosome and the lower 32 bits represent the position. + * If the record's cp matches the requested cp or is downstream of the requested cp, 1 is returned. + * If the record's cp is upstream of the requested cp, -1 is returned. + * If the requested cp is -1, indicating a wildcard, 1 is returned. + * + * @param requestedCpAsLong The requested chromosome position as a long value. + * @param recordLine The record line to compare. + * @param chrStartsWithChr Indicates if the chromosome name in the record starts with "chr". + * @return 1 if the record is our record or downstream of the requested cp, -1 if it is upstream, 0 otherwise. + */ + public static int isThisOurRecordShortcut(long requestedCpAsLong, String recordLine, boolean chrStartsWithChr) { + if (requestedCpAsLong == -1) { + return 1; + } + int firstTabIndex = recordLine.indexOf(DEFAULT_DELIMITER); + int recordChrInt = ChrPositionUtils.convertContigNameToInt(recordLine.substring(chrStartsWithChr ? 
3 : 0, firstTabIndex)); + + if (recordChrInt == requestedCpAsLong >>> 32) { + // same chromosome, examine the position + int position = Integer.parseInt(recordLine, firstTabIndex + 1, recordLine.indexOf(DEFAULT_DELIMITER, firstTabIndex + 1), 10); + return Integer.compare((int) (requestedCpAsLong & 0x00000000FFFFFFFFL), position); + } else { + // examine the chromosome only + return Integer.compare((int) (requestedCpAsLong >>> 32), recordChrInt); + } + } + + public static ChrPosition getCpFromRecord(String[] rec, int chrPositionInRecord, int positionPositionInRecord) { + if (null == rec || rec.length == 0) { + throw new IllegalArgumentException("String array rec is null or empty"); + } + if (Math.max(chrPositionInRecord, positionPositionInRecord) >= rec.length) { + throw new IllegalArgumentException("String array rec is of length: " + rec.length + ", and Math.max(chrPositionInRecord, positionPositionInRecord): " + Math.max(chrPositionInRecord, positionPositionInRecord)); + } + return new ChrPointPosition(rec[chrPositionInRecord], Integer.parseInt(rec[positionPositionInRecord])); + } + + public static long getCpAsLongFromRecord(String[] rec, int chrPositionInRecord, int positionPositionInRecord, boolean chrStartsWithChr) { + if (null == rec || rec.length == 0) { + throw new IllegalArgumentException("String array rec is null or empty"); + } + if (Math.max(chrPositionInRecord, positionPositionInRecord) >= rec.length) { + throw new IllegalArgumentException("String array rec is of length: " + rec.length + ", and Math.max(chrPositionInRecord, positionPositionInRecord): " + Math.max(chrPositionInRecord, positionPositionInRecord)); + } + + String recordChr = chrStartsWithChr ? 
rec[chrPositionInRecord].substring(3) : rec[chrPositionInRecord]; + int position = Integer.parseInt(rec[positionPositionInRecord]); + return ChrPositionUtils.convertContigAndPositionToLong(recordChr, position); + } + + /* + * 1 based numbering + * much like a compare method, this will return 0 if the requestedCp is the same as the rec, + * 1 if the requestedCp is upstream of the rec + * -1 if the requestedCp is downstream of the rec + */ + public static int isThisOurRecord(long requestedCpAsLong, String[] recArray, int chrPositionInRecord, int positionPositionInRecord, boolean chrStartsWithChr) { + + return isThisOurRecord(requestedCpAsLong, recArray[chrPositionInRecord], Integer.parseInt(recArray[positionPositionInRecord]), chrStartsWithChr); + } + + public static int isThisOurRecord(long requestedCpAsLong, String recordChr, int recordPosition, boolean chrStartsWithChr) { + if (requestedCpAsLong == -1) { + return 1; + } + long recordAsLong = ChrPositionUtils.convertContigAndPositionToLong(chrStartsWithChr ? recordChr.substring(3) : recordChr, recordPosition); + return Long.compare(requestedCpAsLong, recordAsLong); + } + + /** + * Compares the chromosome name and start positions of two variants. + * + * @param chr1 The chromosome name of the first variant. + * @param position1 The start position of the first variant. + * @param chr2 The chromosome name of the second variant. + * @param position2 The start position of the second variant. + * @return 1 if the first variant is greater, -1 if the second variant is greater, 0 if they are equal. + */ + public static int compareChromosomeNameAndStartPositions(String chr1, int position1, String chr2, int position2) { + if (null == chr1) { + return 1; + } + if (null == chr2) { + return -1; + } + boolean chr1StartsWithChr = chr1.startsWith("chr"); + boolean chr2StartsWithChr = chr2.startsWith("chr"); + int diff = COMP.compare((chr1StartsWithChr ? chr1.substring(3) : chr1), (chr2StartsWithChr ? 
chr2.substring(3) : chr2)); + if (diff != 0) { + return diff; + } + /* + * check position now + */ + return Integer.compare(position1, position2); + } + + + /** + * Compares two ChrPosition objects and determines if they are equal. + * + * @param cp1 The first ChrPosition object to compare. + * @param cp2 The second ChrPosition object to compare. + * @return 0 if the two ChrPosition objects are equal, 1 if cp1 is greater than cp2 or cp2 is null, -1 if cp1 is smaller than cp2 or cp1 is null. + */ + public static int compareCPs(ChrPosition cp1, ChrPosition cp2) { + if (null == cp1) { + return 1; + } + if (null == cp2) { + return -1; + } + + if (areChrPointPositionsEqual(cp1, cp2, true)) { + return 0; + } + + int nameAndStartPositionMatch = compareChromosomeNameAndStartPositions(cp1.getChromosome(), cp1.getStartPosition(), cp2.getChromosome(), cp2.getStartPosition()); + if (nameAndStartPositionMatch != 0) { + return nameAndStartPositionMatch; + } + + return Integer.compare(cp1.getEndPosition(), cp2.getEndPosition()); + } + + /** + * Compares two ChrPosition objects and determines if their chromosome and start positions are equal. + * + * @param cp1 The first ChrPosition object to compare. + * @param cp2 The second ChrPosition object to compare. + * @param ignoreChromosome If true, removes 'chr' from chromosome name (if present) when comparing. + * @return True if the chromosome and start positions are equal, otherwise false. 
+ */ + public static boolean areChrPointPositionsEqual(ChrPosition cp1, ChrPosition cp2, boolean ignoreChromosome) { + if (cp1 == null || cp2 == null) { + return false; + } + if (cp1.getStartPosition() == cp2.getStartPosition()) { + if (cp1.getChromosome().equals(cp2.getChromosome())) { + return true; + } + if (ignoreChromosome) { + boolean cp1StartsWithChr = cp1.getChromosome().startsWith("chr"); + boolean cp2StartsWithChr = cp2.getChromosome().startsWith("chr"); + if ((cp1StartsWithChr && cp2StartsWithChr) || (!cp1StartsWithChr && !cp2StartsWithChr)) { + return false; + } else { + return (cp1StartsWithChr ? cp1.getChromosome().substring(3) : cp1.getChromosome()).equals((cp2StartsWithChr ? cp2.getChromosome().substring(3) : cp2.getChromosome())); + } + } + } + return false; + } } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java index ae181370b..005a0875f 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java @@ -15,258 +15,294 @@ import org.qcmg.qio.record.RecordReader; public class AnnotationSourceSnpEffVCF extends AnnotationSource { - - public static final String FIELD_DELIMITER_SEMI_COLON = ";"; - - public static final Map SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS = Stream.of( - new SimpleEntry<>("alt", 0), - new SimpleEntry<>("annotation", 1), - new SimpleEntry<>("effect", 1), // annotation is also known as effect - new SimpleEntry<>("putative_impact", 2), - new SimpleEntry<>("gene_name", 3), - new SimpleEntry<>("gene_id", 4), - new SimpleEntry<>("feature_type", 5), - new SimpleEntry<>("feature_id", 6), - new SimpleEntry<>("transcript_biotype", 7), - new SimpleEntry<>("rank", 8), - new SimpleEntry<>("hgvs.c", 9), - new SimpleEntry<>("hgvs.p", 10), - new SimpleEntry<>("cdna_position", 11), - new SimpleEntry<>("cds_position", 12), - new 
SimpleEntry<>("protein_position", 13), - new SimpleEntry<>("distance_to_feature", 14), - new SimpleEntry<>("errors", 15), - new SimpleEntry<>("warnings", 15), - new SimpleEntry<>("information", 15)).collect(Collectors.toMap(SimpleEntry::getKey, SimpleEntry::getValue)); - - - - List annotationFields; - String emptyRecordResult; - - public AnnotationSourceSnpEffVCF(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile, String fieldNames) { - super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile); - // TODO Auto-generated constructor stub - - if (StringUtils.isNullOrEmpty(fieldNames)) { - throw new IllegalArgumentException("Null or empty fieldNames parameter passed to AnnotationSourceVCF ctor"); - } - /* - * should check to ensure the header contains the request field names - */ - - annotationFields = Arrays.stream(fieldNames.split(",")).collect(Collectors.toList()); - emptyRecordResult = getEmptyRecordReturnValue(fieldNames); - } - - @Override - public String getAnnotation(ChrPosition requestedCp) { - - logger.debug(reader.getFile().getName() + ": requestedCp is " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? currentCP.toIGVString() : null) + ", nextCP: " + (null != nextCP ? 
nextCP.toIGVString() : null)); - - /* - * check to see if the records we currently have stored are a match - */ - if ( areCPsEqual(requestedCp, currentCP) == 0) { - - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (recAlt.contains(",")) { - String [] recAltArray = recAlt.split(","); - for (String recAltValue : recAltArray) { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAltValue)) { - return annotationToReturnWithAlt(rec, recAltValue); - } - } - } else { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturnWithAlt(rec, recAlt); - } - } - } - - } else if (null != nextCP && areCPsEqual(requestedCp, nextCP) < 0) { - /* - * requestedCp is "less than" next CP - * return empty list here - */ - } else { - logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null)); - getNextRecord(requestedCp); - if ( areCPsEqual(requestedCp, currentCP) == 0) { - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (recAlt.contains(",")) { - String [] recAltArray = recAlt.split(","); - for (String recAltValue : recAltArray) { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAltValue)) { - return annotationToReturnWithAlt(rec, recAltValue); - } - } - } else { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturnWithAlt(rec, recAlt); - } - } - } - } else { - /* - * requestedCP and currentCP are not equal - */ - } - } - return annotationToReturn(null); - } - - @Override - public String annotationToReturn(String record) { - if (null == record) { - return emptyRecordResult; - } - /* - * dealing with a vcf file and assuming that the required annotation fields are in the INFO field - * so get that and go from there. - */ - String [] recordArray = record.split("\t"); - String info = recordArray[7]; - String alt = recordArray[4]; - - /* - * entries in the INFO field are delimited by ';' - */ - logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); - return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); - } - public String annotationToReturnWithAlt(String record, String alt) { - if (null == record) { - return emptyRecordResult; - } - /* - * dealing with a vcf file and assuming that the required annotation fields are in the INFO field - * so get that and go from there. 
- */ - String [] recordArray = record.split("\t"); - String info = recordArray[7]; - - /* - * entries in the INFO field are delimited by ';' - */ - logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); - return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); - } - - - public static String extractFieldsFromInfoField(String info, List fields, String emptyInfoFieldResult, String alt) { - if (StringUtils.isNullOrEmptyOrMissingData(info)) { - return emptyInfoFieldResult; - } - - String dataToReturn = ""; - String worstConsequence = getWorstConsequence(info, alt); - - /* - * if we didn't have a match on alt, return the empty result - */ - if (StringUtils.isNullOrEmpty(worstConsequence)) { - return emptyInfoFieldResult; - } - - /* - * we have our consequence - * split by pipe and then get our fields - */ - String [] consequenceArray = TabTokenizer.tokenize(worstConsequence, '|'); - - for (String af : fields) { - if ( ! StringUtils.isNullOrEmpty(af)) { - - /* - * get position from map - */ - String aflc = af.toLowerCase(); - Integer arrayPosition = SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS.get(aflc); - if (null != arrayPosition && arrayPosition >= 0 && arrayPosition < consequenceArray.length) { - /* - * good - */ - String annotation = consequenceArray[arrayPosition]; - dataToReturn += ! dataToReturn.isEmpty() ? 
FIELD_DELIMITER_TAB + af + "=" + annotation : af + "=" + annotation; - } else { + + public static final String FIELD_DELIMITER_SEMI_COLON = ";"; + + public static final Map SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS = Stream.of( + new SimpleEntry<>("alt", 0), + new SimpleEntry<>("annotation", 1), + new SimpleEntry<>("effect", 1), // annotation is also known as effect + new SimpleEntry<>("putative_impact", 2), + new SimpleEntry<>("gene_name", 3), + new SimpleEntry<>("gene_id", 4), + new SimpleEntry<>("feature_type", 5), + new SimpleEntry<>("feature_id", 6), + new SimpleEntry<>("transcript_biotype", 7), + new SimpleEntry<>("rank", 8), + new SimpleEntry<>("hgvs.c", 9), + new SimpleEntry<>("hgvs.p", 10), + new SimpleEntry<>("cdna_position", 11), + new SimpleEntry<>("cds_position", 12), + new SimpleEntry<>("protein_position", 13), + new SimpleEntry<>("distance_to_feature", 14), + new SimpleEntry<>("errors", 15), + new SimpleEntry<>("warnings", 15), + new SimpleEntry<>("information", 15)).collect(Collectors.toMap(SimpleEntry::getKey, SimpleEntry::getValue)); + + + List annotationFields; + String emptyRecordResult; + + public AnnotationSourceSnpEffVCF(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, + int refPositionInFile, int altPositionInFile, String fieldNames, boolean chrStartsWithChr) { + super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile, chrStartsWithChr); + // TODO Auto-generated constructor stub + + if (StringUtils.isNullOrEmpty(fieldNames)) { + throw new IllegalArgumentException("Null or empty fieldNames parameter passed to AnnotationSourceVCF ctor"); + } + /* + * should check to ensure the header contains the request field names + */ + + annotationFields = Arrays.stream(fieldNames.split(",")).collect(Collectors.toList()); + emptyRecordResult = getEmptyRecordReturnValue(fieldNames); + } + + @Override + public String getAnnotation(long requestedCpAsLong, ChrPosition requestedCp) { + +// 
logger.debug(reader.getFile().getName() + ": requestedCp is " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? currentCP.toIGVString() : null) + ", nextCP: " + (null != nextCP ? nextCP.toIGVString() : null)); + + /* + * check to see if the records we currently have stored are a match + */ + if (requestedCpAsLong == currentCPAsLong) { + + /* + * we match on position + * lets see if there are any records that match on ref and alt + */ +// return getAnnotationsFromRecords(requestedCp); + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (recAlt.contains(",")) { + String[] recAltArray = recAlt.split(","); + for (String recAltValue : recAltArray) { + if (reqRef.equals(recRef) && reqAlt.equals(recAltValue)) { + return annotationToReturnWithAlt(rec, recAltValue); + } + } + } else { + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturnWithAlt(rec, recAlt); + } + } + } + } + + } else { + int matchWithNextCP = Long.compare(requestedCpAsLong, nextCPAsLong); + if (nextCPAsLong > -1 && matchWithNextCP < 0) { + + } else { + +// logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null)); + getNextRecord(requestedCpAsLong, matchWithNextCP); + if (requestedCpAsLong == currentCPAsLong) { + /* + * we match on position + * lets see if there are any records that match on ref and alt + */ + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (recAlt.contains(",")) { + String[] recAltArray = recAlt.split(","); + for (String recAltValue : recAltArray) { + if (reqRef.equals(recRef) && reqAlt.equals(recAltValue)) { + return annotationToReturnWithAlt(rec, recAltValue); + } + } + } else { + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturnWithAlt(rec, recAlt); + } + } + } + } +// return getAnnotationsFromRecords(requestedCp); + } + /* + * requestedCP and currentCP are not equal + */ + } + } + return annotationToReturn(null); + } + + private String getAnnotationsFromRecords(ChrPosition requestedCp){ + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (recAlt.contains(",")) { + String[] recAltArray = recAlt.split(","); + for (String recAltValue : recAltArray) { + if (reqRef.equals(recRef) && reqAlt.equals(recAltValue)) { + return annotationToReturnWithAlt(rec, recAltValue); + } + } + } else { + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturnWithAlt(rec, recAlt); + } + } + } + } + return annotationToReturn(null); + } + + @Override + public String annotationToReturn(String[] record) { 
+ if (null == record) { + return emptyRecordResult; + } + /* + * dealing with a vcf file and assuming that the required annotation fields are in the INFO field + * so get that and go from there. + */ +// String[] recordArray = record.split("\t"); + String info = record[7]; + String alt = record[4]; + + /* + * entries in the INFO field are delimited by ';' + */ + logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); + return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); + } + + public String annotationToReturnWithAlt(String record, String alt) { + if (null == record) { + return emptyRecordResult; + } + /* + * dealing with a vcf file and assuming that the required annotation fields are in the INFO field + * so get that and go from there. + */ + String[] recordArray = record.split("\t"); + String info = recordArray[7]; + + /* + * entries in the INFO field are delimited by ';' + */ + logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); + return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); + } + + + public static String extractFieldsFromInfoField(String info, List fields, String emptyInfoFieldResult, String alt) { + if (StringUtils.isNullOrEmptyOrMissingData(info)) { + return emptyInfoFieldResult; + } + + StringBuilder dataToReturn = new StringBuilder(); + String worstConsequence = getWorstConsequence(info, alt); + + /* + * if we didn't have a match on alt, return the empty result + */ + if (StringUtils.isNullOrEmpty(worstConsequence)) { + return emptyInfoFieldResult; + } + + /* + * we have our consequence + * split by pipe and then get our fields + */ + String[] consequenceArray = TabTokenizer.tokenize(worstConsequence, '|'); + + for (String af : fields) { + if (!StringUtils.isNullOrEmpty(af)) { + + /* + * get position from map + */ + String aflc = af.toLowerCase(); + Integer arrayPosition = SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS.get(aflc); + 
if (null != arrayPosition && arrayPosition >= 0 && arrayPosition < consequenceArray.length) { + /* + * good + */ + String annotation = consequenceArray[arrayPosition]; + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + af + "=" + annotation : af + "=" + annotation); + } else { // System.out.println("Could not find field [" + af + "] in SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS map!"); // System.out.println("arrayPosition.intValue(): " + arrayPosition.intValue() + ", consequenceArray.length: " + consequenceArray.length); - } - - } - } - return dataToReturn.isEmpty() ? emptyInfoFieldResult : dataToReturn; - } - - /** - * @param info - * @param alt - * @return - */ - public static String getWorstConsequence(String info, String alt) { - /* - * SnpEff annotations are in the following format: - * ANN=|||||||,|||||||,|||||||| - * ie. a comma separated (ordered) list of consequences, which in turn are pipe delimited and contain the following columns: - * alt|effect|Putative_impact| - * - * - * - * snpEff sorts consequences as follows: - * Effect sort order. When multiple effects are reported, SnpEff sorts the effects the following way: - - * Putative impact: Effects having higher putative impact are first. - * Effect type: Effects assumed to be more deleterious effects first. - * Canonical transcript before non-canonical. - * Marker genomic coordinates (e.g. genes starting before first). - * - * - */ - - /* - * first get the consequence corresponding to this alt - * There will most likely be more than 1 - * Pick the first one as that is the one with the highest effect as decreed by snpEff - */ - int annoIndex = info.indexOf("ANN="); - int end = info.indexOf(FIELD_DELIMITER_SEMI_COLON, annoIndex); - String ann = info.substring(annoIndex + 4, end == -1 ? 
info.length() : end); - - - String [] annArray = ann.split(","); - String worstConsequence = ""; - for (String aa : annArray) { - if (aa.startsWith(alt)) { - worstConsequence = aa; - break; - } - } - return worstConsequence; - } - - @Override - public void close() throws IOException { - if (null != reader) { - reader.close(); - } - } - + } + + } + } + return (dataToReturn.isEmpty()) ? emptyInfoFieldResult : dataToReturn.toString(); + } + + /** + * @param info + * @param alt + * @return + */ + public static String getWorstConsequence(String info, String alt) { + /* + * SnpEff annotations are in the following format: + * ANN=|||||||,|||||||,|||||||| + * ie. a comma separated (ordered) list of consequences, which in turn are pipe delimited and contain the following columns: + * alt|effect|Putative_impact| + * + * + * + * snpEff sorts consequences as follows: + * Effect sort order. When multiple effects are reported, SnpEff sorts the effects the following way: + + * Putative impact: Effects having higher putative impact are first. + * Effect type: Effects assumed to be more deleterious effects first. + * Canonical transcript before non-canonical. + * Marker genomic coordinates (e.g. genes starting before first). + * + * + */ + + /* + * first get the consequence corresponding to this alt + * There will most likely be more than 1 + * Pick the first one as that is the one with the highest effect as decreed by snpEff + */ + int annoIndex = info.indexOf("ANN="); + int end = info.indexOf(FIELD_DELIMITER_SEMI_COLON, annoIndex); + String ann = info.substring(annoIndex + 4, end == -1 ? 
info.length() : end); + + + String[] annArray = ann.split(","); + String worstConsequence = ""; + for (String aa : annArray) { + if (aa.startsWith(alt)) { + worstConsequence = aa; + break; + } + } + return worstConsequence; + } + + @Override + public void close() throws IOException { + if (null != reader) { + reader.close(); + } + } + } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java index b08036e81..3c050499c 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java @@ -20,8 +20,8 @@ public class AnnotationSourceTSV extends AnnotationSource { Map headerNameAndPosition; public AnnotationSourceTSV(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile, String fieldNames) { - super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile); + int refPositionInFile, int altPositionInFile, String fieldNames, boolean chrStartsWithChr) { + super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile, chrStartsWithChr); // TODO Auto-generated constructor stub if (StringUtils.isNullOrEmpty(fieldNames)) { @@ -58,12 +58,12 @@ public static String getLastHeaderLine(List headerLines) { /* * easy */ - header = headerLines.get(0); + header = headerLines.getFirst(); } else if (headerLines.size() > 1) { /* * going to assume that the last line contains the header line */ - header = headerLines.get(headerLines.size() - 1); + header = headerLines.getLast(); } return header; } @@ -88,8 +88,8 @@ public static Map getHeaderNameAndPositions(String fieldNames, } @Override - public String annotationToReturn(String record) { - if (null == record) { + public String annotationToReturn(String[] record) { + if (null == record || record.length == 0) { return 
emptyRecordResult; } @@ -99,20 +99,21 @@ public String annotationToReturn(String record) { return extractFieldsFromRecord(record, headerNameAndPosition); } - public static String extractFieldsFromRecord(String record, Map fields) { - String dataToReturn = ""; - if ( ! StringUtils.isNullOrEmpty(record) && null != fields) { - String [] recordArray = TabTokenizer.tokenize(record); + public static String extractFieldsFromRecord(String[] record, Map fields) { + StringBuilder dataToReturn = new StringBuilder(); + int recordLength = null != record ? record.length : 0; + if ( recordLength > 0 && null != fields) { +// String [] recordArray = TabTokenizer.tokenize(record); for (Entry entry : fields.entrySet()) { /* * make sure that array length is not shorter than entry value */ - if (recordArray.length > entry.getValue().intValue()) { - dataToReturn += (dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB : "") + entry.getKey() + "=" + recordArray[entry.getValue().intValue()]; + if (recordLength > entry.getValue()) { + dataToReturn.append(( ! dataToReturn.isEmpty()) ? 
FIELD_DELIMITER_TAB : "").append(entry.getKey()).append("=").append(record[entry.getValue()]); } } } - return dataToReturn; + return dataToReturn.toString(); } @Override diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java index af0cc652c..dc15768ab 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java @@ -18,8 +18,8 @@ public class AnnotationSourceVCF extends AnnotationSource { String emptyRecordResult; public AnnotationSourceVCF(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile, String fieldNames) { - super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile); + int refPositionInFile, int altPositionInFile, String fieldNames, boolean chrStartsWithChr) { + super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile, chrStartsWithChr); // TODO Auto-generated constructor stub if (StringUtils.isNullOrEmpty(fieldNames)) { @@ -34,19 +34,15 @@ public AnnotationSourceVCF(RecordReader reader, int chrPositionInRecord, } @Override - public String annotationToReturn(String record) { - if (null == record) { + public String annotationToReturn(String [] record) { + if (null == record || record.length == 0) { return emptyRecordResult; } /* * dealing with a vcf file and assuming that the required annotation fields are in the INFO field * so get that and go from there. 
*/ - String [] recordArray = record.split("\t"); -// if (recordArray.length <= 8) { -// System.out.println("vcf length <= 8: " + record); -// } - String info = recordArray[7]; + String info = record[7]; /* * entries in the INFO field are delimited by ';' @@ -59,23 +55,23 @@ public static String extractFieldsFromInfoField(String info, List fields if (StringUtils.isNullOrEmptyOrMissingData(info)) { return emptyInfoFieldResult; } - String dataToReturn = ""; + StringBuilder dataToReturn = new StringBuilder(); for (String af : fields) { if ( ! StringUtils.isNullOrEmpty(af)) { int start = info.indexOf(af + "="); if (start > -1) { int end = info.indexOf(FIELD_DELIMITER_SEMI_COLON, start); if (end == -1) { - dataToReturn += dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB + info.substring(start) : info.substring(start); + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + info.substring(start) : info.substring(start)); } else { - dataToReturn += dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB + info.substring(start, end) : info.substring(start, end); + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + info.substring(start, end) : info.substring(start, end)); } } else { - dataToReturn += dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB + af + "=" : af + "="; + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + af + "=" : af + "="); } } } - return dataToReturn.length() == 0 ? emptyInfoFieldResult : dataToReturn; + return (dataToReturn.isEmpty()) ? 
emptyInfoFieldResult : dataToReturn.toString(); } @Override diff --git a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java index d422d5d1a..c8334a4f7 100644 --- a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java +++ b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java @@ -1,8 +1,5 @@ package au.edu.qimr.qannotate.nanno; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; @@ -19,8 +16,11 @@ import org.junit.rules.TemporaryFolder; import org.qcmg.common.commandline.Executor; import org.qcmg.common.model.ChrPositionRefAlt; +import org.qcmg.common.util.ChrPositionUtils; import org.qcmg.common.vcf.header.VcfHeaderUtils; +import static org.junit.Assert.*; + public class AnnotateTest { @@ -31,17 +31,18 @@ public class AnnotateTest { public void jsonInputs() throws IOException { File inputJson = testFolder.newFile("inputs.json"); File annotationSource = testFolder.newFile("annotation.vcf"); - createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4); + createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4, true); AnnotationInputs ais = AnnotateUtils.getInputs(inputJson.getAbsolutePath()); - assertEquals(true, ais != null); - assertEquals(1, ais.getInputs().size()); + assertTrue(ais != null); + assert ais != null; + assertEquals(1, ais.getInputs().size()); List sources = new ArrayList<>(); AnnotateUtils.populateAnnotationSources(ais, sources); assertEquals(1, sources.size()); - String annotation = sources.get(0).getAnnotation(new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); + String annotation = sources.getFirst().getAnnotation(ChrPositionUtils.convertContigAndPositionToLong("1", 95813205), new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); assertEquals("blah=", annotation); } @@ -49,7 +50,7 @@ public void jsonInputs() 
throws IOException { public void jsonInputsTSVMissingHeader() throws IOException { File inputJson = testFolder.newFile("inputs.json"); File annotationSource = testFolder.newFile("annotation.tsv"); - createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4); + createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4, true); AnnotationInputs ais = AnnotateUtils.getInputs(inputJson.getAbsolutePath()); try { @@ -87,7 +88,7 @@ public void jsonInputsShouldHandleValidInputs() throws IOException { // Given valid inputs, the method should process them without throwing exceptions File validInputJson = testFolder.newFile("valid_inputs.json"); File validAnnotationSource = testFolder.newFile("valid_annotation.vcf"); - createJsonInputs(validInputJson, validAnnotationSource, "valid", false, 3, 4); + createJsonInputs(validInputJson, validAnnotationSource, "valid", false, 3, 4, true); // When AnnotationInputs ais = AnnotateUtils.getInputs(validInputJson.getAbsolutePath()); @@ -98,9 +99,9 @@ public void jsonInputsShouldHandleValidInputs() throws IOException { List sources = new ArrayList<>(); AnnotateUtils.populateAnnotationSources(ais, sources); - Assert.assertEquals(1, sources.size()); + assertEquals(1, sources.size()); - String annotation = sources.get(0).getAnnotation(new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); + String annotation = sources.getFirst().getAnnotation(ChrPositionUtils.convertContigAndPositionToLong("1", 95813205), new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); Assert.assertEquals("valid=", annotation); } @@ -108,13 +109,13 @@ public void jsonInputsShouldHandleValidInputs() throws IOException { public void jsonInputsTSV() throws IOException { File inputJson = testFolder.newFile("inputs.json"); File annotationSource = testFolder.newFile("annotation.tsv"); - createJsonInputs(inputJson, annotationSource, "aaref,HGVSc_VEP,HGVSp_VEP", false, 3, 4); + createJsonInputs(inputJson, annotationSource, 
"aaref,HGVSc_VEP,HGVSp_VEP", false, 3, 4, false); createAnnotationFile(annotationSource, true); List sources = new ArrayList<>(2); AnnotationInputs ais = AnnotateUtils.getInputs(inputJson.getAbsolutePath()); AnnotateUtils.populateAnnotationSources(ais, sources); assertEquals(1, sources.size()); - String annotation = sources.get(0).getAnnotation(new ChrPositionRefAlt("chr1", 655652, 655652, "A", "T")); + String annotation = sources.getFirst().getAnnotation(ChrPositionUtils.convertContigAndPositionToLong("1", 655652), new ChrPositionRefAlt("chr1", 655652, 655652, "A", "T")); assertEquals("HGVSc_VEP=c.1A>C\tHGVSp_VEP=p.Met1?\taaref=M", annotation); } @@ -139,7 +140,7 @@ public void endToEnd() throws IOException { /* * json inputs - need annotationSource deets */ - createJsonInputs(inputJson, annotationSource, "aaref", false, 3, 4); + createJsonInputs(inputJson, annotationSource, "aaref", false, 3, 4, false); int exitValue = executeTest(inputVcf, inputJson, outputFile, logFile); assertEquals(1, exitValue); @@ -194,7 +195,7 @@ public void endToEndSnpEff() throws IOException { /* * json inputs - need annotationSource deets */ - createJsonInputs(inputJson, snpEffAnnotationSource, "gene_name,feature_id,feature_type,effect,cdna_position,cds_position,protein_position,putative_impact,hgvs.c,hgvs.p", true, 4, 5); + createJsonInputs(inputJson, snpEffAnnotationSource, "gene_name,feature_id,feature_type,effect,cdna_position,cds_position,protein_position,putative_impact,hgvs.c,hgvs.p", true, 4, 5, true); int exitValue = executeTest(inputVcf, inputJson, outputFile, logFile); assertEquals(0, exitValue); @@ -232,7 +233,7 @@ private int executeTest(File inputVcf, File inputJson, File outputFile, File log return 1; } - public static void createJsonInputs(File jsonFile, File annotationFile, String annotationFields, boolean snpEffAnnotationFile, int refPos, int altPos) throws IOException { + public static void createJsonInputs(File jsonFile, File annotationFile, String annotationFields, 
boolean snpEffAnnotationFile, int refPos, int altPos, boolean chrStartsWithChr) throws IOException { List data = Arrays.asList( "{", "\"outputFieldOrder\": \"" + annotationFields + "\",", @@ -242,6 +243,7 @@ public static void createJsonInputs(File jsonFile, File annotationFile, String a "\"inputs\": [{", "\"file\": \"" + annotationFile.getAbsolutePath() + "\",", "\"snpEffVcf\": " + snpEffAnnotationFile + ",", + "\"chrStartsWithChr\": " + chrStartsWithChr + ",", "\"chrIndex\": 1,", "\"positionIndex\": 2,", "\"refIndex\": " + refPos + ",", diff --git a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java index 44415ff0d..c1cc590a1 100644 --- a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java +++ b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java @@ -15,26 +15,26 @@ public class AnnotationSourceTSVTest { @Test public void extractFieldsFromRecord() { assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(null, null)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("", null)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("blah", null)); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{}, null)); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"blah"}, null)); assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(null, new HashMap<>())); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("", new HashMap<>())); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{}, new HashMap<>())); Map fields = new HashMap<>(); - fields.put("foo", Integer.valueOf(0)); - assertEquals("foo=short_record", AnnotationSourceTSV.extractFieldsFromRecord("short_record", fields)); - assertEquals("foo=slightly_longer", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord", fields)); - fields.put("foo", 
Integer.valueOf(10)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord", fields)); - assertEquals("foo=", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\t", fields)); - assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo2", Integer.valueOf(2)); - assertEquals("foo=bar\tfoo2=", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo2", Integer.valueOf(1)); - assertEquals("foo=bar\tfoo2=record", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo2", Integer.valueOf(11)); - assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo", Integer.valueOf(100)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); + fields.put("foo", 0); + assertEquals("foo=short_record", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"short_record"}, fields)); + assertEquals("foo=slightly_longer", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer","record"}, fields)); + fields.put("foo", 10); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer","record"}, fields)); + assertEquals("foo=", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", ""}, fields)); + assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + fields.put("foo2", 2); + assertEquals("foo=bar\tfoo2=", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + 
fields.put("foo2", 1); + assertEquals("foo=bar\tfoo2=record", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + fields.put("foo2", 11); + assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + fields.put("foo", 100); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer","record","","","","","","","","","bar"}, fields)); } diff --git a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java index 29319d2a0..c4e20d5fa 100644 --- a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java +++ b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java @@ -4,6 +4,7 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; +import org.qcmg.common.model.ChrPointPosition; import org.qcmg.common.model.ChrPosition; import org.qcmg.common.util.ChrPositionUtils; @@ -14,114 +15,189 @@ import static org.junit.Assert.*; public class AnnotationSourceTest { - - @Rule - public final TemporaryFolder testFolder = new TemporaryFolder(); - - - @Test - public void compareNameAndPositions() { - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "chr1", 1)); - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "1", 1)); - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "chr1", 1)); - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 1)); - assertEquals(-1, Integer.compare(1, 2)); - assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 2)); - assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "2", 2)); - assertEquals(0, 
AnnotationSource.compareChromosomeNameAndStartPositions("2", 2, "2", 2)); - assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("2", 3, "2", 2)); - assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("3", 32, "2", 2)); - } - - @Test - public void compareCPs() { - ChrPosition cp1 = null; - ChrPosition cp2 = null; - assertEquals(1, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp1 = ChrPositionUtils.getChrPosition("chr1", 1, 1); - assertEquals(-1, AnnotationSource.areCPsEqual(cp1, cp2)); - cp2 = ChrPositionUtils.getChrPosition("chr1", 1, 1); - assertEquals(0, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp2 = ChrPositionUtils.getChrPosition("chr1", 2, 2); - assertEquals(-1, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp1 = ChrPositionUtils.getChrPosition("chr1", 3, 3); - assertEquals(1, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp2 = ChrPositionUtils.getChrPosition("chr1", 3, 3); - assertEquals(0, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp2 = ChrPositionUtils.getChrPosition("chr3", 3, 3); - assertEquals(-2, AnnotationSource.areCPsEqual(cp1, cp2)); - } - - @Test - public void isThisOurRecord() { - ChrPosition cp1 = ChrPositionUtils.getChrPosition("chr10", 1, 1); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, null, 0)); - assertEquals(1, AnnotationSource.isThisOurRecord(null, null, 0)); - assertEquals(1, AnnotationSource.isThisOurRecord(cp1, "chr10", 0)); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, "chr10", 2)); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, "chr11", 1)); - assertEquals(1, AnnotationSource.isThisOurRecord(cp1, "chr9", 1)); - assertEquals(0, AnnotationSource.isThisOurRecord(cp1, "chr10", 1)); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, "chr10", 10)); - assertEquals(1, AnnotationSource.isThisOurRecord(cp1, "chr10", 0)); - } - - @Test - public void getCPFromStringArray() { - try { + + @Rule + public final TemporaryFolder testFolder = new TemporaryFolder(); + + + 
@Test + public void compareNameAndPositions() { + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "chr1", 1)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "1", 1)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "chr1", 1)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 1)); + assertEquals(-1, Integer.compare(1, 2)); + assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 2)); + assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "2", 2)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("2", 2, "2", 2)); + assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("2", 3, "2", 2)); + assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("3", 32, "2", 2)); + } + + @Test + public void testIsThisOurRecordShortcut() { + ChrPosition cp = new ChrPointPosition("chr1", 1000); + long cpAsLong = ChrPositionUtils.convertContigAndPositionToLong("chr1", 1000); + + // Match chr, within bounds + int result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr1\t1000\tATGC", true); + assertEquals(0, result); + + // Match chr, but out of bounds + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr1\t2500\tATGC", true); + assertEquals(-1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr1\t500\tATGC", true); + Assert.assertEquals(1, result); + + // No chr match + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr2\t500\tATGC", true); + assertEquals(-1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr2\t1000\tATGC", true); + assertEquals(-1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr2\t1500\tATGC", true); + assertEquals(-1, result); + + // Null ChrPosition + result = 
AnnotationSource.isThisOurRecordShortcut(-1, "chr1\t1500\tATGC", true); + assertEquals(1, result); + + cp = new ChrPointPosition("chr10", 246987); + cpAsLong = ChrPositionUtils.convertContigAndPositionToLong("chr10", 246987); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr9\t138122079\tATGC", true); + assertEquals(1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr10\t1000\tATGC", true); + assertEquals(1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr10\t246987\tATGC", true); + assertEquals(0, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr10\t2469870\tATGC", true); + assertEquals(-1, result); + } + + @Test + public void compareCPs() { + ChrPosition cp1 = null; + ChrPosition cp2 = null; + assertEquals(1, AnnotationSource.compareCPs(cp1, cp2)); + + cp1 = ChrPositionUtils.getChrPosition("chr1", 1, 1); + assertEquals(-1, AnnotationSource.compareCPs(cp1, cp2)); + cp2 = ChrPositionUtils.getChrPosition("chr1", 1, 1); + assertEquals(0, AnnotationSource.compareCPs(cp1, cp2)); + + cp2 = ChrPositionUtils.getChrPosition("chr1", 2, 2); + assertEquals(-1, AnnotationSource.compareCPs(cp1, cp2)); + + cp1 = ChrPositionUtils.getChrPosition("chr1", 3, 3); + assertEquals(1, AnnotationSource.compareCPs(cp1, cp2)); + + cp2 = ChrPositionUtils.getChrPosition("chr1", 3, 3); + assertEquals(0, AnnotationSource.compareCPs(cp1, cp2)); + + cp2 = ChrPositionUtils.getChrPosition("chr3", 3, 3); + assertEquals(-2, AnnotationSource.compareCPs(cp1, cp2)); + } + + @Test + public void isThisOurRecord() { + ChrPosition cp1 = ChrPositionUtils.getChrPosition("chr10", 1, 1); + long l = ChrPositionUtils.convertContigAndPositionToLong("10", 1); +// assertEquals(-1, AnnotationSource.isThisOurRecord(l, null, 0, false)); + assertEquals(1, AnnotationSource.isThisOurRecord(-1, null, 0, false)); + assertEquals(1, AnnotationSource.isThisOurRecord(l, "chr10", 0, true)); + assertEquals(-1, 
AnnotationSource.isThisOurRecord(l, "chr10", 2, true)); + assertEquals(-1, AnnotationSource.isThisOurRecord(l, "chr11", 1, true)); + assertEquals(1, AnnotationSource.isThisOurRecord(l, "chr9", 1, true)); + assertEquals(0, AnnotationSource.isThisOurRecord(l, "chr10", 1, true)); + assertEquals(-1, AnnotationSource.isThisOurRecord(l, "chr10", 10, true)); + assertEquals(1, AnnotationSource.isThisOurRecord(l, "chr10", 0, true)); + } + + @Test + public void getCPFromStringArray() { + try { assertNull(AnnotationSource.getCpFromRecord(null, 0, 0)); - Assert.fail("Should have thrown IAE"); - } catch (IllegalArgumentException iae) {} - try { + Assert.fail("Should have thrown IAE"); + } catch (IllegalArgumentException iae) { + } + try { assertNull(AnnotationSource.getCpFromRecord(new String[]{}, 0, 0)); - Assert.fail("Should have thrown IAE"); - } catch (IllegalArgumentException iae) {} - ChrPosition cp = ChrPositionUtils.getChrPosition("1", 1, 1); - assertEquals(cp, AnnotationSource.getCpFromRecord(new String[]{"1"}, 0, 0)); - assertEquals(ChrPositionUtils.getChrPosition("1", 0, 0), AnnotationSource.getCpFromRecord(new String[]{"1", "0"}, 0, 1)); - assertEquals(ChrPositionUtils.getChrPosition("1", 2, 2), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 0, 1)); - assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 1, 0)); - try { - assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 10, 0)); - Assert.fail("Should have thrown IAE"); - } catch (IllegalArgumentException iae) {} - } - - @Test - public void vcfFields() { - String info = ""; - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("",""), "EMPTY")); - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", Arrays.asList("",""), "EMPTY")); - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", List.of(""), 
"EMPTY")); - info = "ALLELEID=75079;CLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202;CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided;CLNHGVS=NC_000001.11:g.1232279A>G;CLNREVSTAT=criteria_provided,_multiple_submitters,_no_conflicts;CLNSIG=Pathogenic;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=OMIM_Allelic_Variant:615291.0001;GENEINFO=B3GALT6:126792;MC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant;ORIGIN=1;RS=786200938"; - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of(""), "EMPTY")); - assertEquals("ALLELEID=75079", AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of("ALLELEID"), "EMPTY")); - assertEquals("ALLELEID=75079\tCLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("ALLELEID","CLNDISDB"), "EMPTY")); - assertEquals("CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided\tMC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("CLNDN","MC"), "EMPTY")); - info = 
"ALLELEID=1211496;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.19251559C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=EMC1:23065;ORIGIN=1"; - assertEquals("MC=\tCLNDN=not_provided", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("MC","CLNDN"), "EMPTY")); - } - - @Test - public void tsvGetHeader() { - String header = "#chr pos(1-based) ref alt aaref aaalt rs_dbSNP151 hg19_chr hg19_pos(1-based) hg18_chr hg18_pos(1-based) aapos genename Ensembl_geneid Ensembl_transcriptid Ensembl_proteinid Uniprot_acc Uniprot_entry HGVSc_ANNOVAR HGVSp_ANNOVAR HGVSc_snpEff HGVSp_snpEff HGVSc_VEP HGVSp_VEP APPRIS GENCODE_basic TSL VEP_canonical cds_strand refcodon codonpos codon_degeneracy Ancestral_allele AltaiNeandertal Denisova VindijiaNeandertal SIFT_score SIFT_converted_rankscore SIFT_pred SIFT4G_score SIFT4G_converted_rankscore SIFT4G_pred Polyphen2_HDIV_score Polyphen2_HDIV_rankscore Polyphen2_HDIV_pred Polyphen2_HVAR_score Polyphen2_HVAR_rankscore Polyphen2_HVAR_pred LRT_score LRT_converted_rankscore LRT_pred LRT_Omega MutationTaster_score MutationTaster_converted_rankscore MutationTaster_pred MutationTaster_model MutationTaster_AAE MutationAssessor_score MutationAssessor_rankscore MutationAssessor_pred FATHMM_score FATHMM_converted_rankscore FATHMM_pred PROVEAN_score PROVEAN_converted_rankscore PROVEAN_pred VEST4_score VEST4_rankscore MetaSVM_score MetaSVM_rankscore MetaSVM_pred MetaLR_score MetaLR_rankscore MetaLR_pred Reliability_index M-CAP_score M-CAP_rankscore M-CAP_pred REVEL_score REVEL_rankscore MutPred_score MutPred_rankscore MutPred_protID MutPred_AAchange MutPred_Top5features MVP_score MVP_rankscore MPC_score MPC_rankscore PrimateAI_score PrimateAI_rankscore PrimateAI_pred DEOGEN2_score DEOGEN2_rankscore DEOGEN2_pred BayesDel_addAF_score BayesDel_addAF_rankscore BayesDel_addAF_pred BayesDel_noAF_score BayesDel_noAF_rankscore BayesDel_noAF_pred 
ClinPred_score ClinPred_rankscore ClinPred_pred LIST-S2_score LIST-S2_rankscore LIST-S2_pred Aloft_Fraction_transcripts_affected Aloft_prob_Tolerant Aloft_prob_Recessive Aloft_prob_Dominant Aloft_pred Aloft_Confidence CADD_raw CADD_raw_rankscore CADD_phred CADD_raw_hg19 CADD_raw_rankscore_hg19 CADD_phred_hg19 DANN_score DANN_rankscore fathmm-MKL_coding_score fathmm-MKL_coding_rankscore fathmm-MKL_coding_pred fathmm-MKL_coding_group fathmm-XF_coding_score fathmm-XF_coding_rankscore fathmm-XF_coding_pred Eigen-raw_coding Eigen-raw_coding_rankscore Eigen-phred_coding Eigen-PC-raw_coding Eigen-PC-raw_coding_rankscore Eigen-PC-phred_coding GenoCanyon_score GenoCanyon_rankscore integrated_fitCons_score integrated_fitCons_rankscore integrated_confidence_value GM12878_fitCons_score GM12878_fitCons_rankscore GM12878_confidence_value H1-hESC_fitCons_score H1-hESC_fitCons_rankscore H1-hESC_confidence_value HUVEC_fitCons_score HUVEC_fitCons_rankscore HUVEC_confidence_value LINSIGHT LINSIGHT_rankscore GERP++_NR GERP++_RS GERP++_RS_rankscore phyloP100way_vertebrate phyloP100way_vertebrate_rankscore phyloP30way_mammalian phyloP30way_mammalian_rankscore phyloP17way_primate phyloP17way_primate_rankscore phastCons100way_vertebrate phastCons100way_vertebrate_rankscore phastCons30way_mammalian phastCons30way_mammalian_rankscore phastCons17way_primatephastCons17way_primate_rankscore SiPhy_29way_pi SiPhy_29way_logOdds SiPhy_29way_logOdds_rankscore bStatistic bStatistic_converted_rankscore 1000Gp3_AC 1000Gp3_AF 1000Gp3_AFR_AC 1000Gp3_AFR_AF 1000Gp3_EUR_AC 1000Gp3_EUR_AF 1000Gp3_AMR_AC 1000Gp3_AMR_AF 1000Gp3_EAS_AC 1000Gp3_EAS_AF 1000Gp3_SAS_AC 1000Gp3_SAS_AF TWINSUK_AC TWINSUK_AF ALSPAC_AC ALSPAC_AF UK10K_AC UK10K_AF ESP6500_AA_AC ESP6500_AA_AF ESP6500_EA_AC ESP6500_EA_AF ExAC_AC ExAC_AF ExAC_Adj_AC ExAC_Adj_AF ExAC_AFR_AC ExAC_AFR_AF ExAC_AMR_AC ExAC_AMR_AF ExAC_EAS_AC ExAC_EAS_AF ExAC_FIN_AC ExAC_FIN_AF ExAC_NFE_AC ExAC_NFE_AF ExAC_SAS_AC ExAC_SAS_AF ExAC_nonTCGA_AC ExAC_nonTCGA_AF 
ExAC_nonTCGA_Adj_AC ExAC_nonTCGA_Adj_AF ExAC_nonTCGA_AFR_AC ExAC_nonTCGA_AFR_AF ExAC_nonTCGA_AMR_AC ExAC_nonTCGA_AMR_AF ExAC_nonTCGA_EAS_AC ExAC_nonTCGA_EAS_AF ExAC_nonTCGA_FIN_AC ExAC_nonTCGA_FIN_AF ExAC_nonTCGA_NFE_AC ExAC_nonTCGA_NFE_AF ExAC_nonTCGA_SAS_AC ExAC_nonTCGA_SAS_AF ExAC_nonpsych_AC ExAC_nonpsych_AF ExAC_nonpsych_Adj_AC ExAC_nonpsych_Adj_AF ExAC_nonpsych_AFR_AC ExAC_nonpsych_AFR_AF ExAC_nonpsych_AMR_AC ExAC_nonpsych_AMR_AF ExAC_nonpsych_EAS_AC ExAC_nonpsych_EAS_AF ExAC_nonpsych_FIN_AC ExAC_nonpsych_FIN_AF ExAC_nonpsych_NFE_AC ExAC_nonpsych_NFE_AF ExAC_nonpsych_SAS_AC ExAC_nonpsych_SAS_AF gnomAD_exomes_flag gnomAD_exomes_AC gnomAD_exomes_AN gnomAD_exomes_AF gnomAD_exomes_nhomalt gnomAD_exomes_AFR_AC gnomAD_exomes_AFR_AN gnomAD_exomes_AFR_AF gnomAD_exomes_AFR_nhomalt gnomAD_exomes_AMR_AC gnomAD_exomes_AMR_AN gnomAD_exomes_AMR_AF gnomAD_exomes_AMR_nhomalt gnomAD_exomes_ASJ_AC gnomAD_exomes_ASJ_AN gnomAD_exomes_ASJ_AF gnomAD_exomes_ASJ_nhomalt gnomAD_exomes_EAS_AC gnomAD_exomes_EAS_AN gnomAD_exomes_EAS_AF gnomAD_exomes_EAS_nhomalt gnomAD_exomes_FIN_AC gnomAD_exomes_FIN_AN gnomAD_exomes_FIN_AF gnomAD_exomes_FIN_nhomalt gnomAD_exomes_NFE_AC gnomAD_exomes_NFE_AN gnomAD_exomes_NFE_AF gnomAD_exomes_NFE_nhomalt gnomAD_exomes_SAS_AC gnomAD_exomes_SAS_AN gnomAD_exomes_SAS_AF gnomAD_exomes_SAS_nhomalt gnomAD_exomes_POPMAX_AC gnomAD_exomes_POPMAX_AN gnomAD_exomes_POPMAX_AF gnomAD_exomes_POPMAX_nhomalt gnomAD_exomes_controls_AC gnomAD_exomes_controls_AN gnomAD_exomes_controls_AF gnomAD_exomes_controls_nhomalt gnomAD_exomes_controls_AFR_AC gnomAD_exomes_controls_AFR_AN gnomAD_exomes_controls_AFR_AF gnomAD_exomes_controls_AFR_nhomalt gnomAD_exomes_controls_AMR_AC gnomAD_exomes_controls_AMR_AN gnomAD_exomes_controls_AMR_AF gnomAD_exomes_controls_AMR_nhomalt gnomAD_exomes_controls_ASJ_AC gnomAD_exomes_controls_ASJ_AN gnomAD_exomes_controls_ASJ_AF gnomAD_exomes_controls_ASJ_nhomalt gnomAD_exomes_controls_EAS_AC gnomAD_exomes_controls_EAS_AN gnomAD_exomes_controls_EAS_AF 
gnomAD_exomes_controls_EAS_nhomalt gnomAD_exomes_controls_FIN_AC gnomAD_exomes_controls_FIN_AN gnomAD_exomes_controls_FIN_AF gnomAD_exomes_controls_FIN_nhomalt gnomAD_exomes_controls_NFE_AC gnomAD_exomes_controls_NFE_AN gnomAD_exomes_controls_NFE_AF gnomAD_exomes_controls_NFE_nhomalt gnomAD_exomes_controls_SAS_AC gnomAD_exomes_controls_SAS_AN gnomAD_exomes_controls_SAS_AF gnomAD_exomes_controls_SAS_nhomalt gnomAD_exomes_controls_POPMAX_AC gnomAD_exomes_controls_POPMAX_AN gnomAD_exomes_controls_POPMAX_AF gnomAD_exomes_controls_POPMAX_nhomalt gnomAD_genomes_flag gnomAD_genomes_AC gnomAD_genomes_AN gnomAD_genomes_AF gnomAD_genomes_nhomalt gnomAD_genomes_AFR_AC gnomAD_genomes_AFR_AN gnomAD_genomes_AFR_AF gnomAD_genomes_AFR_nhomalt gnomAD_genomes_AMR_AC gnomAD_genomes_AMR_AN gnomAD_genomes_AMR_AF gnomAD_genomes_AMR_nhomalt gnomAD_genomes_ASJ_AC gnomAD_genomes_ASJ_AN gnomAD_genomes_ASJ_AF gnomAD_genomes_ASJ_nhomalt gnomAD_genomes_EAS_AC gnomAD_genomes_EAS_AN gnomAD_genomes_EAS_AF gnomAD_genomes_EAS_nhomalt gnomAD_genomes_FIN_AC gnomAD_genomes_FIN_AN gnomAD_genomes_FIN_AF gnomAD_genomes_FIN_nhomalt gnomAD_genomes_NFE_AC gnomAD_genomes_NFE_AN gnomAD_genomes_NFE_AF gnomAD_genomes_NFE_nhomalt gnomAD_genomes_AMI_AC gnomAD_genomes_AMI_AN gnomAD_genomes_AMI_AF gnomAD_genomes_AMI_nhomalt gnomAD_genomes_SAS_AC gnomAD_genomes_SAS_AN gnomAD_genomes_SAS_AF gnomAD_genomes_SAS_nhomalt gnomAD_genomes_POPMAX_AC gnomAD_genomes_POPMAX_AN gnomAD_genomes_POPMAX_AF gnomAD_genomes_POPMAX_nhomalt clinvar_id clinvar_clnsig clinvar_trait clinvar_review clinvar_hgvs clinvar_var_source clinvar_MedGen_id clinvar_OMIM_id clinvar_Orphanet_id Interpro_domain GTEx_V8_gene GTEx_V8_tissueGeuvadis_eQTL_target_gene"; - Map headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr", header); - assertEquals(1, headerMap.size()); + Assert.fail("Should have thrown IAE"); + } catch (IllegalArgumentException iae) { + } + ChrPosition cp = ChrPositionUtils.getChrPosition("1", 1, 1); + assertEquals(cp, 
AnnotationSource.getCpFromRecord(new String[]{"1"}, 0, 0)); + assertEquals(ChrPositionUtils.getChrPosition("1", 0, 0), AnnotationSource.getCpFromRecord(new String[]{"1", "0"}, 0, 1)); + assertEquals(ChrPositionUtils.getChrPosition("1", 2, 2), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 0, 1)); + assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 1, 0)); + try { + assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 10, 0)); + Assert.fail("Should have thrown IAE"); + } catch (IllegalArgumentException iae) { + } + } + + @Test + public void vcfFields() { + String info = ""; + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("", ""), "EMPTY")); + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", Arrays.asList("", ""), "EMPTY")); + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", List.of(""), "EMPTY")); + info = "ALLELEID=75079;CLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202;CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided;CLNHGVS=NC_000001.11:g.1232279A>G;CLNREVSTAT=criteria_provided,_multiple_submitters,_no_conflicts;CLNSIG=Pathogenic;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=OMIM_Allelic_Variant:615291.0001;GENEINFO=B3GALT6:126792;MC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant;ORIGIN=1;RS=786200938"; + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of(""), "EMPTY")); + assertEquals("ALLELEID=75079", 
AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of("ALLELEID"), "EMPTY")); + assertEquals("ALLELEID=75079\tCLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("ALLELEID", "CLNDISDB"), "EMPTY")); + assertEquals("CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided\tMC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("CLNDN", "MC"), "EMPTY")); + info = "ALLELEID=1211496;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.19251559C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=EMC1:23065;ORIGIN=1"; + assertEquals("MC=\tCLNDN=not_provided", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("MC", "CLNDN"), "EMPTY")); + } + + @Test + public void tsvGetHeader() { + String header = "#chr pos(1-based) ref alt aaref aaalt rs_dbSNP151 hg19_chr hg19_pos(1-based) hg18_chr hg18_pos(1-based) aapos genename Ensembl_geneid Ensembl_transcriptid Ensembl_proteinid Uniprot_acc Uniprot_entry HGVSc_ANNOVAR HGVSp_ANNOVAR HGVSc_snpEff HGVSp_snpEff HGVSc_VEP HGVSp_VEP APPRIS GENCODE_basic TSL VEP_canonical cds_strand refcodon codonpos codon_degeneracy Ancestral_allele AltaiNeandertal Denisova VindijiaNeandertal SIFT_score SIFT_converted_rankscore SIFT_pred SIFT4G_score SIFT4G_converted_rankscore SIFT4G_pred Polyphen2_HDIV_score Polyphen2_HDIV_rankscore Polyphen2_HDIV_pred Polyphen2_HVAR_score Polyphen2_HVAR_rankscore Polyphen2_HVAR_pred LRT_score LRT_converted_rankscore LRT_pred LRT_Omega 
MutationTaster_score MutationTaster_converted_rankscore MutationTaster_pred MutationTaster_model MutationTaster_AAE MutationAssessor_score MutationAssessor_rankscore MutationAssessor_pred FATHMM_score FATHMM_converted_rankscore FATHMM_pred PROVEAN_score PROVEAN_converted_rankscore PROVEAN_pred VEST4_score VEST4_rankscore MetaSVM_score MetaSVM_rankscore MetaSVM_pred MetaLR_score MetaLR_rankscore MetaLR_pred Reliability_index M-CAP_score M-CAP_rankscore M-CAP_pred REVEL_score REVEL_rankscore MutPred_score MutPred_rankscore MutPred_protID MutPred_AAchange MutPred_Top5features MVP_score MVP_rankscore MPC_score MPC_rankscore PrimateAI_score PrimateAI_rankscore PrimateAI_pred DEOGEN2_score DEOGEN2_rankscore DEOGEN2_pred BayesDel_addAF_score BayesDel_addAF_rankscore BayesDel_addAF_pred BayesDel_noAF_score BayesDel_noAF_rankscore BayesDel_noAF_pred ClinPred_score ClinPred_rankscore ClinPred_pred LIST-S2_score LIST-S2_rankscore LIST-S2_pred Aloft_Fraction_transcripts_affected Aloft_prob_Tolerant Aloft_prob_Recessive Aloft_prob_Dominant Aloft_pred Aloft_Confidence CADD_raw CADD_raw_rankscore CADD_phred CADD_raw_hg19 CADD_raw_rankscore_hg19 CADD_phred_hg19 DANN_score DANN_rankscore fathmm-MKL_coding_score fathmm-MKL_coding_rankscore fathmm-MKL_coding_pred fathmm-MKL_coding_group fathmm-XF_coding_score fathmm-XF_coding_rankscore fathmm-XF_coding_pred Eigen-raw_coding Eigen-raw_coding_rankscore Eigen-phred_coding Eigen-PC-raw_coding Eigen-PC-raw_coding_rankscore Eigen-PC-phred_coding GenoCanyon_score GenoCanyon_rankscore integrated_fitCons_score integrated_fitCons_rankscore integrated_confidence_value GM12878_fitCons_score GM12878_fitCons_rankscore GM12878_confidence_value H1-hESC_fitCons_score H1-hESC_fitCons_rankscore H1-hESC_confidence_value HUVEC_fitCons_score HUVEC_fitCons_rankscore HUVEC_confidence_value LINSIGHT LINSIGHT_rankscore GERP++_NR GERP++_RS GERP++_RS_rankscore phyloP100way_vertebrate phyloP100way_vertebrate_rankscore phyloP30way_mammalian 
phyloP30way_mammalian_rankscore phyloP17way_primate phyloP17way_primate_rankscore phastCons100way_vertebrate phastCons100way_vertebrate_rankscore phastCons30way_mammalian phastCons30way_mammalian_rankscore phastCons17way_primatephastCons17way_primate_rankscore SiPhy_29way_pi SiPhy_29way_logOdds SiPhy_29way_logOdds_rankscore bStatistic bStatistic_converted_rankscore 1000Gp3_AC 1000Gp3_AF 1000Gp3_AFR_AC 1000Gp3_AFR_AF 1000Gp3_EUR_AC 1000Gp3_EUR_AF 1000Gp3_AMR_AC 1000Gp3_AMR_AF 1000Gp3_EAS_AC 1000Gp3_EAS_AF 1000Gp3_SAS_AC 1000Gp3_SAS_AF TWINSUK_AC TWINSUK_AF ALSPAC_AC ALSPAC_AF UK10K_AC UK10K_AF ESP6500_AA_AC ESP6500_AA_AF ESP6500_EA_AC ESP6500_EA_AF ExAC_AC ExAC_AF ExAC_Adj_AC ExAC_Adj_AF ExAC_AFR_AC ExAC_AFR_AF ExAC_AMR_AC ExAC_AMR_AF ExAC_EAS_AC ExAC_EAS_AF ExAC_FIN_AC ExAC_FIN_AF ExAC_NFE_AC ExAC_NFE_AF ExAC_SAS_AC ExAC_SAS_AF ExAC_nonTCGA_AC ExAC_nonTCGA_AF ExAC_nonTCGA_Adj_AC ExAC_nonTCGA_Adj_AF ExAC_nonTCGA_AFR_AC ExAC_nonTCGA_AFR_AF ExAC_nonTCGA_AMR_AC ExAC_nonTCGA_AMR_AF ExAC_nonTCGA_EAS_AC ExAC_nonTCGA_EAS_AF ExAC_nonTCGA_FIN_AC ExAC_nonTCGA_FIN_AF ExAC_nonTCGA_NFE_AC ExAC_nonTCGA_NFE_AF ExAC_nonTCGA_SAS_AC ExAC_nonTCGA_SAS_AF ExAC_nonpsych_AC ExAC_nonpsych_AF ExAC_nonpsych_Adj_AC ExAC_nonpsych_Adj_AF ExAC_nonpsych_AFR_AC ExAC_nonpsych_AFR_AF ExAC_nonpsych_AMR_AC ExAC_nonpsych_AMR_AF ExAC_nonpsych_EAS_AC ExAC_nonpsych_EAS_AF ExAC_nonpsych_FIN_AC ExAC_nonpsych_FIN_AF ExAC_nonpsych_NFE_AC ExAC_nonpsych_NFE_AF ExAC_nonpsych_SAS_AC ExAC_nonpsych_SAS_AF gnomAD_exomes_flag gnomAD_exomes_AC gnomAD_exomes_AN gnomAD_exomes_AF gnomAD_exomes_nhomalt gnomAD_exomes_AFR_AC gnomAD_exomes_AFR_AN gnomAD_exomes_AFR_AF gnomAD_exomes_AFR_nhomalt gnomAD_exomes_AMR_AC gnomAD_exomes_AMR_AN gnomAD_exomes_AMR_AF gnomAD_exomes_AMR_nhomalt gnomAD_exomes_ASJ_AC gnomAD_exomes_ASJ_AN gnomAD_exomes_ASJ_AF gnomAD_exomes_ASJ_nhomalt gnomAD_exomes_EAS_AC gnomAD_exomes_EAS_AN gnomAD_exomes_EAS_AF gnomAD_exomes_EAS_nhomalt gnomAD_exomes_FIN_AC gnomAD_exomes_FIN_AN gnomAD_exomes_FIN_AF 
gnomAD_exomes_FIN_nhomalt gnomAD_exomes_NFE_AC gnomAD_exomes_NFE_AN gnomAD_exomes_NFE_AF gnomAD_exomes_NFE_nhomalt gnomAD_exomes_SAS_AC gnomAD_exomes_SAS_AN gnomAD_exomes_SAS_AF gnomAD_exomes_SAS_nhomalt gnomAD_exomes_POPMAX_AC gnomAD_exomes_POPMAX_AN gnomAD_exomes_POPMAX_AF gnomAD_exomes_POPMAX_nhomalt gnomAD_exomes_controls_AC gnomAD_exomes_controls_AN gnomAD_exomes_controls_AF gnomAD_exomes_controls_nhomalt gnomAD_exomes_controls_AFR_AC gnomAD_exomes_controls_AFR_AN gnomAD_exomes_controls_AFR_AF gnomAD_exomes_controls_AFR_nhomalt gnomAD_exomes_controls_AMR_AC gnomAD_exomes_controls_AMR_AN gnomAD_exomes_controls_AMR_AF gnomAD_exomes_controls_AMR_nhomalt gnomAD_exomes_controls_ASJ_AC gnomAD_exomes_controls_ASJ_AN gnomAD_exomes_controls_ASJ_AF gnomAD_exomes_controls_ASJ_nhomalt gnomAD_exomes_controls_EAS_AC gnomAD_exomes_controls_EAS_AN gnomAD_exomes_controls_EAS_AF gnomAD_exomes_controls_EAS_nhomalt gnomAD_exomes_controls_FIN_AC gnomAD_exomes_controls_FIN_AN gnomAD_exomes_controls_FIN_AF gnomAD_exomes_controls_FIN_nhomalt gnomAD_exomes_controls_NFE_AC gnomAD_exomes_controls_NFE_AN gnomAD_exomes_controls_NFE_AF gnomAD_exomes_controls_NFE_nhomalt gnomAD_exomes_controls_SAS_AC gnomAD_exomes_controls_SAS_AN gnomAD_exomes_controls_SAS_AF gnomAD_exomes_controls_SAS_nhomalt gnomAD_exomes_controls_POPMAX_AC gnomAD_exomes_controls_POPMAX_AN gnomAD_exomes_controls_POPMAX_AF gnomAD_exomes_controls_POPMAX_nhomalt gnomAD_genomes_flag gnomAD_genomes_AC gnomAD_genomes_AN gnomAD_genomes_AF gnomAD_genomes_nhomalt gnomAD_genomes_AFR_AC gnomAD_genomes_AFR_AN gnomAD_genomes_AFR_AF gnomAD_genomes_AFR_nhomalt gnomAD_genomes_AMR_AC gnomAD_genomes_AMR_AN gnomAD_genomes_AMR_AF gnomAD_genomes_AMR_nhomalt gnomAD_genomes_ASJ_AC gnomAD_genomes_ASJ_AN gnomAD_genomes_ASJ_AF gnomAD_genomes_ASJ_nhomalt gnomAD_genomes_EAS_AC gnomAD_genomes_EAS_AN gnomAD_genomes_EAS_AF gnomAD_genomes_EAS_nhomalt gnomAD_genomes_FIN_AC gnomAD_genomes_FIN_AN gnomAD_genomes_FIN_AF gnomAD_genomes_FIN_nhomalt 
gnomAD_genomes_NFE_AC gnomAD_genomes_NFE_AN gnomAD_genomes_NFE_AF gnomAD_genomes_NFE_nhomalt gnomAD_genomes_AMI_AC gnomAD_genomes_AMI_AN gnomAD_genomes_AMI_AF gnomAD_genomes_AMI_nhomalt gnomAD_genomes_SAS_AC gnomAD_genomes_SAS_AN gnomAD_genomes_SAS_AF gnomAD_genomes_SAS_nhomalt gnomAD_genomes_POPMAX_AC gnomAD_genomes_POPMAX_AN gnomAD_genomes_POPMAX_AF gnomAD_genomes_POPMAX_nhomalt clinvar_id clinvar_clnsig clinvar_trait clinvar_review clinvar_hgvs clinvar_var_source clinvar_MedGen_id clinvar_OMIM_id clinvar_Orphanet_id Interpro_domain GTEx_V8_gene GTEx_V8_tissueGeuvadis_eQTL_target_gene"; + Map headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr", header); + assertEquals(1, headerMap.size()); assertTrue(headerMap.containsKey("chr")); assertTrue(headerMap.containsValue(0)); - - headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr,genename", header); - assertEquals(2, headerMap.size()); + + headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr,genename", header); + assertEquals(2, headerMap.size()); assertTrue(headerMap.containsKey("chr")); assertTrue(headerMap.containsValue(0)); assertTrue(headerMap.containsKey("genename")); - assertEquals(Integer.valueOf(12), headerMap.get("genename")); - } - - + assertEquals(Integer.valueOf(12), headerMap.get("genename")); + } + + @Test + public void testAreChrPointPositionsEqual() { + + // test both chromosomes and start positions are identical + ChrPosition chrPos1 = new ChrPointPosition("chr1", 12345); // assumes chr, start, and end as properties + ChrPosition chrPos2 = new ChrPointPosition("chr1", 12345); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + // test for different chromosomes, start positions are identical + chrPos1 = new ChrPointPosition("chr1", 12345); + chrPos2 = new ChrPointPosition("1", 12345); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, 
chrPos2, false)); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + chrPos1 = new ChrPointPosition("1", 12345); + chrPos2 = new ChrPointPosition("1", 12345); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + + // test for different chromosomes, start positions are identical + chrPos1 = new ChrPointPosition("chr1", 12345); + chrPos2 = new ChrPointPosition("chr2", 12345); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + // test for identical chromosomes, start positions are different + chrPos1 = new ChrPointPosition("chr1", 12345); + chrPos2 = new ChrPointPosition("chr1", 23456); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + } } diff --git a/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java b/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java index 7c5787377..8fd4f1bd8 100644 --- a/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java +++ b/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java @@ -1,165 +1,221 @@ /** * © Copyright QIMR Berghofer Medical Research Institute 2014-2016. - * + *

* This code is released under the terms outlined in the included LICENSE file. -*/ + */ package org.qcmg.common.model; +import java.io.Serial; import java.io.Serializable; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; +import java.util.*; import org.qcmg.common.string.StringUtils; import org.qcmg.common.vcf.VcfRecord; -public class ChrPositionComparator implements Comparator, Serializable { - - /** - * not very sure. At moment I accept the old version of this comparator, so always use the default uid. - */ - private static final long serialVersionUID = 1L; - private static final ReferenceNameComparator COMPARATOR = new ReferenceNameComparator(); - public static final List contigs = Collections.unmodifiableList(Arrays.asList("chr1","chr2", "chr3","chr4","chr5","chr6","chr7","chr8","chr9", - "chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY", - "GL000191.1","GL000192.1","GL000193.1","GL000194.1","GL000195.1","GL000196.1","GL000197.1","GL000198.1","GL000199.1", - "GL000200.1","GL000201.1","GL000202.1","GL000203.1","GL000204.1","GL000205.1","GL000206.1","GL000207.1","GL000208.1", - "GL000209.1","GL000210.1","GL000211.1","GL000212.1","GL000213.1","GL000214.1","GL000215.1","GL000216.1","GL000217.1", - "GL000218.1","GL000219.1","GL000220.1","GL000221.1","GL000222.1","GL000223.1","GL000224.1","GL000225.1","GL000226.1", - "GL000227.1","GL000228.1","GL000229.1","GL000230.1","GL000231.1","GL000232.1","GL000233.1","GL000234.1","GL000235.1", - "GL000236.1","GL000237.1","GL000238.1","GL000239.1","GL000240.1","GL000241.1","GL000242.1","GL000243.1","GL000244.1", - "GL000245.1","GL000246.1","GL000247.1","GL000248.1","GL000249.1","chrMT")); - - public static final List HG38_CONTIGS = Collections.unmodifiableList(Arrays.asList("1","2", "3","4","5","6","7","8","9", - "10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y","M")); - - 
@Override - public int compare(ChrPosition o1, ChrPosition o2) { - int chromosomeDiff = COMPARATOR.compare(o1.getChromosome(), o2.getChromosome()); - if (chromosomeDiff != 0) - return chromosomeDiff; - - int positionDiff = o1.getStartPosition() - o2.getStartPosition(); - if (positionDiff != 0) - return positionDiff; - - return o1.getEndPosition() - o2.getEndPosition(); - } - - /** - * Creates a ChrPosition comparator that is based on the chromosome name comparator supplied as an argument. - * Allows the the user to be flexible as to how ChrPosition objects are compared - * @param chrNameComp - * @return - */ - public static Comparator getComparator(Comparator chrNameComp) { - - return Comparator.comparing(ChrPosition::getChromosome, chrNameComp) - .thenComparingInt(ChrPosition::getStartPosition) - .thenComparingInt(ChrPosition::getEndPosition); - } - - - /** - * This method is useful if you have a list of contigs whose order you want to preserve. - * eg. a sorted bam will in its header have a list of contigs, and it is possible that you would like to sort chromosome (Strings) based on this order - * - * If the list is empty of null, then then @link ReferenceNameComparator comparator will be returned. - * - * @param chrNameComp - * @return - */ - public static Comparator getChrNameComparator(List list) { - - return (null == list || list.isEmpty()) ? 
COMPARATOR : - new Comparator() { - @Override - public int compare(String o1, String o2) { - int i1 = list.indexOf(o1); - int i2 = list.indexOf(o2); - if (i1 >= 0 && i2 >= 0) { - return i1 - i2; - } else if (i1 >= 0 && i2 == -1) { - // o1.chr in list but not o2.chr => o1 < o2 - return -1; - } else if (i1 == -1 && i2 >= 0) { - // o2.chr in list but not o1.chr => o2 < o1 - return 1; - } else { - assert i1 == -1 && i2 == -1; - // neither o1 nor o2 chr in list => "natural" ordering - return o1.compareTo(o2); - } - } - }; - - } - - /** - * Return a comparator for VCF records, preserving the order according to the supplied - * list of contigs. If the CHROM value of record A is in the list but that of record B isn't - * then record A sorts earlier than the record B. If the CHROM value of neither A nor B is in - * the list then the records are sorted according to the "natural" order given by - * `ChrPositionComparator.compare(o1, o2)` - */ - public static Comparator getVcfRecordComparator(List list) { - - return (null == list || list.isEmpty()) ? 
null : - new Comparator() { - private final ChrPositionComparator chrPosComp = new ChrPositionComparator(); - @Override - public int compare(VcfRecord o1, VcfRecord o2) { - ChrPosition o1Pos = o1.getChrPosition(); - ChrPosition o2Pos = o2.getChrPosition(); - int i1 = list.indexOf(o1Pos.getChromosome()); - int i2 = list.indexOf(o2Pos.getChromosome()); - if (i1 >= 0 && i2 >= 0) { - // o1 & o2 chr in list => order by chr in list then pos - int diff = i1 - i2; - if (diff == 0) { - diff = o1Pos.getStartPosition() - o2Pos.getStartPosition(); - } - return diff; - } else if (i1 >= 0 && i2 == -1) { - // o1.chr in list but not o2.chr => o1 < o2 - return -1; - } else if (i1 == -1 && i2 >= 0) { - // o2.chr in list but not o1.chr => o2 < o1 - return 1; - } else { - assert i1 == -1 && i2 == -1; - // neither o1 nor o2 chr in list => "natural" ordering - return chrPosComp.compare(o1Pos, o2Pos); - } - - } - }; - } - - /** - * Convenience method to return a VCFRecord comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB - * - * @return - */ - public static Comparator getVcfRecordComparatorForGRCh37() { - return getVcfRecordComparator(contigs); - } - /** - * Convenience method to return a ChrPosition comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB - * - * @return - */ - public static Comparator getCPComparatorForGRCh37() { - return getComparator(getChrNameComparator(contigs)); - } - /** - * Convenience method to return a ChrPosition comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB - * - * @return - */ - public static Comparator getStringComparatorForHG38() { - return getChrNameComparator(HG38_CONTIGS); - } +public class ChrPositionComparator implements Comparator, Serializable { + + @Serial + private static final long serialVersionUID = 1L; + private static final ReferenceNameComparator COMPARATOR = new ReferenceNameComparator(); + public static final List contigs = List.of("chr1", "chr2", 
"chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY", "GL000191.1", "GL000192.1", "GL000193.1", "GL000194.1", "GL000195.1", "GL000196.1", "GL000197.1", "GL000198.1", "GL000199.1", "GL000200.1", "GL000201.1", "GL000202.1", "GL000203.1", "GL000204.1", "GL000205.1", "GL000206.1", "GL000207.1", "GL000208.1", "GL000209.1", "GL000210.1", "GL000211.1", "GL000212.1", "GL000213.1", "GL000214.1", "GL000215.1", "GL000216.1", "GL000217.1", "GL000218.1", "GL000219.1", "GL000220.1", "GL000221.1", "GL000222.1", "GL000223.1", "GL000224.1", "GL000225.1", "GL000226.1", "GL000227.1", "GL000228.1", "GL000229.1", "GL000230.1", "GL000231.1", "GL000232.1", "GL000233.1", "GL000234.1", "GL000235.1", "GL000236.1", "GL000237.1", "GL000238.1", "GL000239.1", "GL000240.1", "GL000241.1", "GL000242.1", "GL000243.1", "GL000244.1", "GL000245.1", "GL000246.1", "GL000247.1", "GL000248.1", "GL000249.1", "chrMT"); + public static final Map HG37_CONTIGS_MAP; + + public static final Map HG38_CONTIGS_MAP = Map.ofEntries(Map.entry("1", 1), Map.entry("2", 2), Map.entry("3", 3), Map.entry("4", 4), Map.entry("5", 5), Map.entry("6", 6), Map.entry("7", 7), Map.entry("8", 8), Map.entry("9", 9), Map.entry("10", 10), Map.entry("11", 11), Map.entry("12", 12), Map.entry("13", 13), Map.entry("14", 14), Map.entry("15", 15), Map.entry("16", 16), Map.entry("17", 17), Map.entry("18", 18), Map.entry("19", 19), Map.entry("20", 20), Map.entry("21", 21), Map.entry("22", 22), Map.entry("X", 23), Map.entry("Y", 24), Map.entry("M", 25)); + + static { + Map temp = new HashMap<>(); + for (int i = 1; i <= 22; i++) { + temp.put("chr" + i, i); + } + temp.put("chrX", 23); + temp.put("chrY", 24); + for (int i = 191, j = 25; i <= 249; i++, j++) { + temp.put("GL000" + i + ".1", j); + } + HG37_CONTIGS_MAP = Collections.unmodifiableMap(temp); + } + @Override + public int compare(ChrPosition o1, ChrPosition 
o2) { + int chromosomeDiff = COMPARATOR.compare(o1.getChromosome(), o2.getChromosome()); + if (chromosomeDiff != 0) return chromosomeDiff; + + int positionDiff = o1.getStartPosition() - o2.getStartPosition(); + if (positionDiff != 0) return positionDiff; + + return o1.getEndPosition() - o2.getEndPosition(); + } + + /** + * Creates a ChrPosition comparator that is based on the chromosome name comparator supplied as an argument. + * Allows the user to be flexible as to how ChrPosition objects are compared + * + */ + public static Comparator getComparator(Comparator chrNameComp) { + + return Comparator.comparing(ChrPosition::getChromosome, chrNameComp).thenComparingInt(ChrPosition::getStartPosition).thenComparingInt(ChrPosition::getEndPosition); + } + + + /** + * This method is useful if you have a list of contigs whose order you want to preserve. + * eg. a sorted bam will in its header have a list of contigs, and it is possible that you would like to sort chromosome (Strings) based on this order + *

+ * If the list is empty of null, then @link ReferenceNameComparator comparator will be returned. + * + */ + public static Comparator getChrNameComparator(List list) { + + return (null == list || list.isEmpty()) ? COMPARATOR : (o1, o2) -> { + int i1 = list.indexOf(o1); + int i2 = list.indexOf(o2); + if (i1 >= 0 && i2 >= 0) { + return i1 - i2; + } else if (i1 >= 0) { + // o1.chr in list but not o2.chr => o1 < o2 + return -1; + } else if (i2 >= 0) { + // o2.chr in list but not o1.chr => o2 < o1 + return 1; + } else { + // neither o1 nor o2 chr in list => "natural" ordering + return o1.compareTo(o2); + } + }; + + } + + /** + * Returns a comparator for sorting chromosome names based on a map of chromosome names and their corresponding positions. + * The comparator sorts the chromosome names based on the positions in the map. If a chromosome name is not present in the map, + * it is treated as having a position of -1 and is sorted after the chromosome names present in the map. + * + * @param map a map of chromosome names and their corresponding positions + * @return a comparator for sorting chromosome names based on the positions in the map + */ + public static Comparator getChrNameComparator(Map map) { + + if (null == map || map.isEmpty()) { + return COMPARATOR; + } + + return (o1, o2) -> { + int i1 = map.getOrDefault(o1, -1); + int i2 = map.getOrDefault(o2, -1); + + if (i1 == -1){ + return (i2 == -1) ? o1.compareTo(o2) : 1; + } + if (i2 == -1){ + return -1; + } + return i1 - i2; + }; + + } + + /** + * Return a comparator for VCF records, preserving the order according to the supplied + * list of contigs. If the CHR value of record A is in the list but that of record B isn't + * then record A sorts earlier than the record B. 
If the CHR value of neither A nor B is in + * the list then the records are sorted according to the "natural" order given by + * `ChrPositionComparator.compare(o1, o2)` + */ + public static Comparator getVcfRecordComparator(List list) { + + return (null == list || list.isEmpty()) ? null : new Comparator<>() { + private final ChrPositionComparator chrPosComp = new ChrPositionComparator(); + + @Override + public int compare(VcfRecord o1, VcfRecord o2) { + ChrPosition o1Pos = o1.getChrPosition(); + ChrPosition o2Pos = o2.getChrPosition(); + int i1 = list.indexOf(o1Pos.getChromosome()); + int i2 = list.indexOf(o2Pos.getChromosome()); + if (i1 >= 0 && i2 >= 0) { + // o1 & o2 chr in list => order by chr in list then pos + int diff = i1 - i2; + if (diff == 0) { + diff = o1Pos.getStartPosition() - o2Pos.getStartPosition(); + } + return diff; + } else if (i1 >= 0) { + // o1.chr in list but not o2.chr => o1 < o2 + return -1; + } else if (i2 >= 0) { + // o2.chr in list but not o1.chr => o2 < o1 + return 1; + } else { + // neither o1 nor o2 chr in list => "natural" ordering + return chrPosComp.compare(o1Pos, o2Pos); + } + + } + }; + } + + /** + * Convenience method to return a VCFRecord comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB + * + */ + public static Comparator getVcfRecordComparatorForGRCh37() { + return getVcfRecordComparator(contigs); + } + + /** + * Convenience method to return a ChrPosition comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB + * + */ + public static Comparator getCPComparatorForGRCh37() { + return getComparator(getChrNameComparator(HG37_CONTIGS_MAP)); + } + + /** + * Convenience method to return a ChrPosition comparator based on the GRCh38 reference file used at QIMRB + * + */ + public static Comparator getStringComparatorForHG38() { + return getChrNameComparator(HG38_CONTIGS_MAP); + } + + /** + * Returns a comparator for sorting chromosome names, from "1" to "M" (inclusive). 
+ * + * The comparator compares chromosome names based on the following rules: + * - If both names are numeric (e.g., "2", "10"), they are sorted numerically. + * - If one name is numeric and the other is not, the numeric name is sorted first. + * - If both names are non-numeric, they are sorted lexicographically. + * + * @return the chromosome name comparator + */ + public static Comparator getChrNameComparatorNoChrsOneToM() { + + return (o1, o2) -> { + + int i1 = Character.isDigit(o1.charAt(0)) ? Integer.parseInt(o1) : -1; + int i2 = Character.isDigit(o2.charAt(0)) ? Integer.parseInt(o2) : -1; + if (i1 > -1 && i2 > -1) { + return i1 - i2; + } + + if (i1 == -1){ + i1 = o1.equals("X") ? 23 : o1.equals("Y") ? 24 : o1.equals("M") ? 25 : -1; + } + if (i2 == -1){ + i2 = o2.equals("X") ? 23 : o2.equals("Y") ? 24 : o2.equals("M") ? 25 : -1; + } + + if (i1 > -1) { + if (i2 > -1) { + return i1 - i2; + } else { + return -1; + } + } else if (i2 > -1) { + return 1; + } + return o1.compareTo(o2); + }; + + } } diff --git a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java index e6f6a9cc0..cd5c5ae8b 100644 --- a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java +++ b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java @@ -30,6 +30,59 @@ public static boolean doChrPositionsOverlap(ChrPosition a, ChrPosition b) { return doChrPositionsOverlap(a, b, 0); } + /** + * convert ChrPosition to a long. 
Will only examine the contig and start position + * + */ + public static long convertContigAndPositionToLong(String contig, int position) { + long l = (long) convertContigNameToInt(contig) << 32; + return l + position; + } + + public static ChrPosition convertLongToChrPosition(long l) { + int contig = (int) (l >> 32); + int position = (int) l; + + if (contig == 23) { + return ChrPointPosition.valueOf("X", position); + } else if (contig == 24) { + return ChrPointPosition.valueOf("Y", position); + } else if (contig == 25) { + return ChrPointPosition.valueOf("M", position); + } else if (contig > 25) { + return ChrPointPosition.valueOf("-1", position); + } + return ChrPointPosition.valueOf(Integer.toString(contig), position); + } + + /** + * Converts a contig name to an integer representing the contig. + * Assumes that the contig name is either a number or a string that does not start with "chr". + * + * @param contigName the name of the contig + * @return an integer representation of the contig + */ + public static int convertContigNameToInt(String contigName) { + if (null == contigName || contigName.isEmpty()) { + throw new IllegalArgumentException("null or empty contig name supplied to convertContigNameToInt"); + } + int i = Character.isDigit(contigName.charAt(0)) ? Integer.parseInt(contigName) : -1; + if (i > -1) { + return i; + } + + if (contigName.length() > 3 && contigName.startsWith("chr")) { + return convertContigNameToInt(contigName.substring(3)); + } + + return switch (contigName) { + case "X" -> 23; + case "Y" -> 24; + case "M" -> 25; + default -> contigName.hashCode(); + }; + } + /** * Checks if two ChrPosition objects overlap with a buffer. 
* @@ -103,8 +156,8 @@ public static ChrPosition createCPFromCosmic(String cosmicCoords) { } else { int colonIndex = cosmicCoords.indexOf(':'); int minusIndex = cosmicCoords.indexOf('-'); - int start = Integer.parseInt(cosmicCoords.substring(colonIndex + 1, minusIndex)); - int end = Integer.parseInt(cosmicCoords.substring(minusIndex + 1)); + int start = Integer.parseInt(cosmicCoords, colonIndex + 1, minusIndex, 10); + int end = Integer.parseInt(cosmicCoords, minusIndex + 1, cosmicCoords.length(), 10); return getChrPosition("chr" + cosmicCoords.substring(0, colonIndex), start, end); } } @@ -167,7 +220,6 @@ public static ChrPosition cloneWithNewChromosomeName(ChrPosition cp, String newC /** * Converts a string in the format "chr1:12345-12345" to a ChrRangePosition object. - * The string must represent a range on the chromosome (start position does not equal end position). * * @param position the string to convert * @return the corresponding ChrRangePosition object @@ -185,8 +237,8 @@ public static ChrRangePosition getChrPositionFromString(String position) { } String chr = position.substring(0, colonPos); - int start = Integer.parseInt(position.substring(colonPos + 1, minusPos)); - int end = Integer.parseInt(position.substring(minusPos + 1)); + int start = Integer.parseInt(position, colonPos + 1, minusPos, 10); + int end = Integer.parseInt(position, minusPos + 1, position.length(), 10); return new ChrRangePosition(chr, start, end); } @@ -212,41 +264,12 @@ public static ChrPositionName getChrPositionNameFromString(String position, Stri } String chr = position.substring(0, colonPos); - int start = Integer.parseInt(position.substring(colonPos + 1, minusPos)); - int end = Integer.parseInt(position.substring(minusPos + 1)); + int start = Integer.parseInt(position, colonPos + 1, minusPos, 10); + int end = Integer.parseInt(position, minusPos + 1, position.length(), 10); return new ChrPositionName(chr, start, end, name); } - /** - * Converts a string in the format 
"chr1:12345-12345" to a ChrPointPosition object. - * The string must represent a single point on the chromosome (start position equals end position). - * - * @param position the string to convert - * @return the corresponding ChrPointPosition object - * @throws IllegalArgumentException if the string is null, empty, not in the correct format, or represents a range rather than a single point - */ - public static ChrPointPosition getChrPointPositionFromString(String position) { - if (StringUtils.isNullOrEmpty(position)) - throw new IllegalArgumentException("Null or empty string passed to getChrPositionFromString()"); - - int colonPos = position.indexOf(':'); - int minusPos = position.indexOf('-'); - - if (colonPos == -1 || minusPos == -1) { - throw new IllegalArgumentException("invalid string passed to getChrPositionFromString() - must be in chr1:12345-23456 format: " + position); - } - - String chr = position.substring(0, colonPos); - int start = Integer.parseInt(position.substring(colonPos + 1, minusPos)); - int end = Integer.parseInt(position.substring(minusPos + 1)); - if (start != end) { - throw new IllegalArgumentException("Start and end position in getChrPointPositionFromString are not the same. Start: " + start + ", end: " + end + ", from string: " + position); - } - - return ChrPointPosition.valueOf(chr, start); - } - /** * Returns a new ChrPosition object that precedes the given ChrPosition. * The start and end positions of the new ChrPosition are each one less than the corresponding positions of the given ChrPosition. 
@@ -258,25 +281,6 @@ public static ChrPosition getPrecedingChrPosition(ChrPosition cp) { return new ChrRangePosition(cp.getChromosome(), cp.getStartPosition() - 1, cp.getEndPosition() - 1); } - /** - * Returns a map of ChrPointPosition objects based on the contents of the supplied String array - * - * @param positions - * @return - */ - public static Map getChrPointPositionsFromStrings(String[] positions) { - - if (null == positions || positions.length == 0) - throw new IllegalArgumentException("null or empty string array passed to getChrPositionsFromStrings"); - - Map chrPositions = new HashMap<>(); - for (String s : positions) { - ChrPosition cpp = getChrPointPositionFromString(s); - chrPositions.put(cpp, cpp); - } - return chrPositions; - } - /** * Converts a ChrPosition and additional data to a VCF string. * diff --git a/qcommon/src/org/qcmg/common/util/TabTokenizer.java b/qcommon/src/org/qcmg/common/util/TabTokenizer.java index 3ab20a7ef..34e659093 100644 --- a/qcommon/src/org/qcmg/common/util/TabTokenizer.java +++ b/qcommon/src/org/qcmg/common/util/TabTokenizer.java @@ -1,7 +1,7 @@ /** * © Copyright The University of Queensland 2010-2014. * © Copyright QIMR Berghofer Medical Research Institute 2014-2016. - * + *

* This code is released under the terms outlined in the included LICENSE file. */ package org.qcmg.common.util; @@ -12,177 +12,177 @@ import java.util.NoSuchElementException; public class TabTokenizer { - - private static final char DELIM = '\t'; - private static final String[] stringArrayType = new String[] {}; - - public static String[] tokenize(final String data) { - return tokenize(data, DELIM); - } - public static String[] tokenize(final String data, int requiredEntries) { - return tokenize(data, DELIM, requiredEntries); - } - - public static String[] tokenize(final String data, final char delim) { - int nextIndex = data.indexOf(delim); - if (nextIndex < 0) { - /* - * rather than throw an IllegalArgumentExcpetion, return an array with the data as the only element - */ - return new String[]{data}; - } - - int currentIndex = 0; - final List resultList = new ArrayList<>(); - - resultList.add(data.substring(currentIndex, nextIndex)); - currentIndex = nextIndex + 1; - - nextIndex = data.indexOf(delim, currentIndex); - while (nextIndex != -1) { - resultList.add(data.substring(currentIndex, nextIndex)); - currentIndex = nextIndex + 1; - nextIndex = data.indexOf(delim, currentIndex); - } - // get last string - resultList.add(data.substring(currentIndex)); - - return resultList.toArray(stringArrayType); - } - - public static String[] tokenize(final String data, final char delim, final int requiredEntries) { - int noOfEntries = 0; - int nextIndex = data.indexOf(delim); - if (nextIndex < 0) { - /* - * rather than throw an IllegalArgumentExcpetion, return an array with the data as the only element - */ - return new String[]{data}; - } - - int currentIndex = 0; - final List resultList = new ArrayList<>(); - - resultList.add(data.substring(currentIndex, nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - - nextIndex = data.indexOf(delim, currentIndex); - while ((noOfEntries <= requiredEntries) && nextIndex != -1) { - resultList.add(data.substring(currentIndex, 
nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - nextIndex = data.indexOf(delim, currentIndex); - } - // get last string - resultList.add(data.substring(currentIndex)); - - return resultList.toArray(stringArrayType); - } - - public static String[] partialTokenize(final String data, final char delim, final int requiredEntries) { - int noOfEntries = 0; - int nextIndex = data.indexOf(delim); - if (nextIndex < 0) { - /* - * rather than throw an IllegalArgumentExcpetion, return an array with the data as the only element - */ - return new String[]{data}; - } - - int currentIndex = 0; - final List resultList = new ArrayList<>(requiredEntries + 1); - - resultList.add(data.substring(currentIndex, nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - - nextIndex = data.indexOf(delim, currentIndex); - while ((noOfEntries < requiredEntries) && nextIndex != -1) { - resultList.add(data.substring(currentIndex, nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - nextIndex = data.indexOf(delim, currentIndex); - } - - return resultList.toArray(stringArrayType); - } - - public static String[] tokenizeCharAt(final String data) { - return tokenizeCharAt(data, DELIM); - } - - public static String[] tokenizeCharAt(final String data, final char delim) { - final List resultList = new ArrayList(); - - int i=0; - int length = data.length(); - while (i<=length) { - int start = i; - while (i, Iterator { - private final String data; - private final char delim; - private int nextIndex; - private int currentIndex = 0; - private String next; - private boolean lastRecord = false; - - public Iter(String data, char delim) { - this.data = data; - this.delim = delim; - readNext(); - } - - @Override - public Iterator iterator() { - return this; - } - - @Override - public boolean hasNext() { - return null != next; - } - - @Override - public String next() { - if ( ! 
hasNext()) - throw new NoSuchElementException(); - - String s = next; - readNext(); - return s; - } - - private void readNext() { - if (lastRecord) { - next = null; - return; - } - nextIndex = data.indexOf(delim, currentIndex); - if (nextIndex < 0) { - lastRecord = true; - next = data.substring(currentIndex); - } else { - next = data.substring(currentIndex, nextIndex); - currentIndex = nextIndex + 1; - } - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - } + + private static final char DELIM = '\t'; + private static final String[] stringArrayType = new String[]{}; + + public static String[] tokenize(final String data) { + return tokenize(data, DELIM); + } + + public static String[] tokenize(final String data, int requiredEntries) { + return tokenize(data, DELIM, requiredEntries); + } + + public static String[] tokenize(final String data, final char delim) { + int nextIndex = data.indexOf(delim); + if (nextIndex < 0) { + /* + * rather than throw an IllegalArgumentException, return an array with the data as the only element + */ + return new String[]{data}; + } + + int currentIndex = 0; + final List resultList = new ArrayList<>(); + + resultList.add(data.substring(currentIndex, nextIndex)); + currentIndex = nextIndex + 1; + + nextIndex = data.indexOf(delim, currentIndex); + while (nextIndex != -1) { + resultList.add(data.substring(currentIndex, nextIndex)); + currentIndex = nextIndex + 1; + nextIndex = data.indexOf(delim, currentIndex); + } + // get last string + resultList.add(data.substring(currentIndex)); + + return resultList.toArray(stringArrayType); + } + + public static String[] tokenize(final String data, final char delim, final int requiredEntries) { + int noOfEntries = 0; + int nextIndex = data.indexOf(delim); + if (nextIndex < 0) { + /* + * rather than throw an IllegalArgumentException, return an array with the data as the only element + */ + return new String[]{data}; + } + + int currentIndex = 0; + final List 
resultList = new ArrayList<>(); + + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + + nextIndex = data.indexOf(delim, currentIndex); + while ((noOfEntries <= requiredEntries) && nextIndex != -1) { + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + nextIndex = data.indexOf(delim, currentIndex); + } + // get last string + resultList.add(data.substring(currentIndex)); + + return resultList.toArray(stringArrayType); + } + + public static String[] partialTokenize(final String data, final char delim, final int requiredEntries) { + int noOfEntries = 0; + int nextIndex = data.indexOf(delim); + if (nextIndex < 0) { + /* + * rather than throw an IllegalArgumentException, return an array with the data as the only element + */ + return new String[]{data}; + } + + int currentIndex = 0; + final List resultList = new ArrayList<>(requiredEntries + 1); + + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + + nextIndex = data.indexOf(delim, currentIndex); + while ((noOfEntries < requiredEntries) && nextIndex != -1) { + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + nextIndex = data.indexOf(delim, currentIndex); + } + + return resultList.toArray(stringArrayType); + } + + public static String[] tokenizeCharAt(final String data) { + return tokenizeCharAt(data, DELIM); + } + + public static String[] tokenizeCharAt(final String data, final char delim) { + final List resultList = new ArrayList<>(); + + int i = 0; + int length = data.length(); + while (i <= length) { + int start = i; + while (i < length && data.charAt(i) != delim) { + i++; + } + resultList.add(data.substring(start, i)); + // do something with the string here + i++; + } + return resultList.toArray(stringArrayType); + } + + static class Iter implements Iterable, Iterator { + private final 
String data; + private final char delim; + private int nextIndex; + private int currentIndex = 0; + private String next; + private boolean lastRecord = false; + + public Iter(String data, char delim) { + this.data = data; + this.delim = delim; + readNext(); + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + return null != next; + } + + @Override + public String next() { + if (!hasNext()) throw new NoSuchElementException(); + + String s = next; + readNext(); + return s; + } + + private void readNext() { + if (lastRecord) { + next = null; + return; + } + nextIndex = data.indexOf(delim, currentIndex); + if (nextIndex < 0) { + lastRecord = true; + next = data.substring(currentIndex); + } else { + next = data.substring(currentIndex, nextIndex); + currentIndex = nextIndex + 1; + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + } } diff --git a/qcommon/src/org/qcmg/common/vcf/VcfRecord.java b/qcommon/src/org/qcmg/common/vcf/VcfRecord.java index 2eef99854..694e61ee4 100644 --- a/qcommon/src/org/qcmg/common/vcf/VcfRecord.java +++ b/qcommon/src/org/qcmg/common/vcf/VcfRecord.java @@ -31,7 +31,7 @@ public class VcfRecord implements Comparable { static final QLogger logger = QLoggerFactory.getLogger(VcfRecord.class); - static final Comparator CHR_POS_COMPARATOR = ChrPositionComparator.getComparator(ChrPositionComparator.getChrNameComparator(null)); + static final Comparator CHR_POS_COMPARATOR = ChrPositionComparator.getComparator(ChrPositionComparator.getChrNameComparator((List)null)); private final ChrPosition cpp; private final String ref; @@ -266,7 +266,7 @@ public void setFormatFields(List field) { */ public VcfFormatFieldRecord getSampleFormatRecord(int index){ String s = (index >= formatRecords.size() || index == 0)? null: formatRecords.get(index); - return (s == null)? null : new VcfFormatFieldRecord(formatRecords.get(0), s); + return (s == null)? 
null : new VcfFormatFieldRecord(formatRecords.getFirst(), s); } /** @@ -276,7 +276,7 @@ public VcfFormatFieldRecord getSampleFormatRecord(int index){ */ public List getFormatFields() { // return a copy of this - if( formatRecords.size() == 0 ) return Collections.emptyList(); + if(formatRecords.isEmpty()) return Collections.emptyList(); return new ArrayList<>(formatRecords); } @@ -410,16 +410,15 @@ public int compareTo(VcfRecord arg0) { if (null != ref && null != arg0.ref) { diff = ref.compareTo(arg0.ref); - } else if (null != ref && null == arg0.ref) { + } else if (null != ref) { diff = -1; - } else if (null == ref && null != arg0.ref) { + } else if (null != arg0.ref) { diff = 1; } else { /* * both null */ - diff = 0; - } + } if (diff != 0) { return diff; @@ -430,16 +429,15 @@ public int compareTo(VcfRecord arg0) { */ if (null != alt && null != arg0.alt) { diff = alt.compareTo(arg0.alt); - } else if (null != alt && null == arg0.alt) { + } else if (null != alt) { diff = -1; - } else if (null == alt && null != arg0.alt) { + } else if (null != arg0.alt) { diff = 1; } else { /* * both null */ - diff = 0; - } + } if (diff != 0) { return diff; } diff --git a/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java b/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java index 147b6ce6a..91549a9a8 100644 --- a/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java +++ b/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java @@ -2,9 +2,7 @@ import static org.junit.Assert.*; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; +import java.util.*; import org.junit.Test; import org.qcmg.common.vcf.VcfRecord; @@ -38,7 +36,7 @@ public void vcfComp() { @Test public void vcfComp2() { - List contigs = Arrays.asList("chr1"); + List contigs = List.of("chr1"); Comparator c = ChrPositionComparator.getVcfRecordComparator(contigs); VcfRecord v1 = VcfUtils.createVcfRecord("chr1", 100); @@ -124,7 +122,11 @@ public void 
cpSortingReferenceAgnostic2() { @Test public void qsigComparatorTesting() { List contigOrder = Arrays.asList("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chrX", "chrY", "GL000199.1", "GL000216.1", "chrMT"); - Comparator cpc = ChrPositionComparator.getChrNameComparator(contigOrder); + Map contigOrderMap = new HashMap<>(); + for (int i = 0; i < contigOrder.size(); i++) { + contigOrderMap.put(contigOrder.get(i), i); + } + Comparator cpc = ChrPositionComparator.getChrNameComparator(contigOrderMap); assertEquals(-1, cpc.compare("chr1", "chr2")); assertEquals(1, cpc.compare("chr2", "chr1")); @@ -135,5 +137,19 @@ public void qsigComparatorTesting() { assertEquals(1, cpc.compare("chrGL000216.1", "chrMT")); assertEquals(-22, cpc.compare("chr1", "chrMT")); } + + @Test + public void testShortcutComparator() { + Comparator cpc = ChrPositionComparator.getChrNameComparatorNoChrsOneToM(); + + assertEquals(-1, cpc.compare("1", "2")); + assertEquals(1, cpc.compare("2", "1")); + assertEquals(0, cpc.compare("2", "2")); + assertEquals(-1, cpc.compare("M", "GL000199.1")); + assertEquals(1, cpc.compare("GL000216.1", "M")); + assertEquals(-24, cpc.compare("1", "M")); + assertEquals(24, cpc.compare("M", "1")); + assertEquals(0, cpc.compare("M", "M")); + } } diff --git a/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java b/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java index b0ed74539..4267a56a4 100644 --- a/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java +++ b/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java @@ -36,6 +36,17 @@ public void testDelta() { assertEquals(true, ChrPositionUtils.arePositionsWithinDelta(cp1, cp2, 4)); } + @Test + public void testConvertChrPositionToLong() { + long expected = ((long) 4 << 32) + 9; + long actual = ChrPositionUtils.convertContigAndPositionToLong("4", 9); + assertEquals(expected, actual); + + 
ChrPosition cp = ChrPositionUtils.convertLongToChrPosition(actual); + assertEquals("4", cp.getChromosome()); + assertEquals(9, cp.getStartPosition()); + + } @Test public void toVcfStringShouldReturnCorrectFormat() { ChrPosition cp = new ChrRangePosition("chr1", 1000, 2000); diff --git a/qio/src/org/qcmg/qio/record/RecordReader.java b/qio/src/org/qcmg/qio/record/RecordReader.java index 138db25d7..8a63c4011 100644 --- a/qio/src/org/qcmg/qio/record/RecordReader.java +++ b/qio/src/org/qcmg/qio/record/RecordReader.java @@ -1,7 +1,7 @@ /** * © Copyright The University of Queensland 2010-2014. * © Copyright QIMR Berghofer Medical Research Institute 2014-2016. - * + *

* This code is released under the terms outlined in the included LICENSE file. */ @@ -22,6 +22,7 @@ import java.util.Iterator; import java.util.List; import java.util.zip.GZIPInputStream; + import org.qcmg.common.util.FileUtils; @@ -29,117 +30,117 @@ public abstract class RecordReader implements Closeable, Iterable { public static final int DEFAULT_BUFFER_SIZE = 65536; public static final String DEFAULT_HEADER_PREFIX = null; //no header line public static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8; - + protected final File file; //only allow create once protected final BufferedReader bin; - protected T next; - + protected T next; + protected List headerLines = new ArrayList<>(); - - public RecordReader(final File file) throws IOException { - this(file, DEFAULT_BUFFER_SIZE); + + public RecordReader(final File file) throws IOException { + this(file, DEFAULT_BUFFER_SIZE); } - + public RecordReader(final File file, int bufferSize) throws IOException { - this(file, bufferSize, DEFAULT_HEADER_PREFIX, DEFAULT_CHARSET); - } - - public RecordReader(final File file, CharSequence headerPrefix) throws IOException { - this(file, DEFAULT_BUFFER_SIZE, headerPrefix, DEFAULT_CHARSET); + this(file, bufferSize, DEFAULT_HEADER_PREFIX, DEFAULT_CHARSET); + } + + public RecordReader(final File file, CharSequence headerPrefix) throws IOException { + this(file, DEFAULT_BUFFER_SIZE, headerPrefix, DEFAULT_CHARSET); } - + public RecordReader(final File file, int bufferSize, CharSequence headerPrefix, Charset charset) throws IOException { this.file = file; - boolean isGzip = FileUtils.isInputGZip( file); - InputStream inputStream = (isGzip) ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); + boolean isGzip = FileUtils.isInputGZip(file); + InputStream inputStream = (isGzip) ? 
new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); InputStreamReader streamReader = new InputStreamReader(inputStream, charset); bin = new BufferedReader(streamReader, bufferSize); - - String nextLine = readHeaderAndReturnFirstNonHeaderLine(headerPrefix); - //get first record, set to null for empty file - next = nextLine == null ? null : getRecord(nextLine); - + + String nextLine = readHeaderAndReturnFirstNonHeaderLine(headerPrefix); + //get first record, set to null for empty file + next = nextLine == null ? null : getRecord(nextLine); + } + /** * this method is overridable in subclass, eg illumina file have different header pattern - * + * * @param headerPrefix * @return the first line just after header * @throws IOException */ - public String readHeaderAndReturnFirstNonHeaderLine(CharSequence headerPrefix ) throws IOException { - - String nextLine = bin.readLine(); - - //keep empty header and return first nonHeaderline - if (headerPrefix == null) return nextLine; - - //reader header, hence file pointer to first line after header - while ( nextLine != null && nextLine.startsWith(headerPrefix + "") ) { - headerLines.add(nextLine); - //reset current read line - nextLine = bin.readLine(); - } - - return nextLine; + public String readHeaderAndReturnFirstNonHeaderLine(CharSequence headerPrefix) throws IOException { + + String nextLine = bin.readLine(); + + //keep empty header and return first nonHeaderline + if (headerPrefix == null) return nextLine; + + //reader header, hence file pointer to first line after header + while (nextLine != null && nextLine.startsWith(headerPrefix + "")) { + headerLines.add(nextLine); + //reset current read line + nextLine = bin.readLine(); + } + + return nextLine; } - - /** - * This reader can maximum take Integer.max lines of file header. Please make other header if bigger than this. 
- * @return a list of header lines - */ - public List getHeader() { - return headerLines; + + /** + * This reader can maximum take Integer.MAX lines of file header. Please make other header if bigger than this. + * + * @return a list of header lines + */ + public List getHeader() { + return headerLines; } @Override /** * Here, BufferedReader.close() calls InputStreamReader.close(), which API told us that it Closes the stream and releases any system resources associated with it. - */ - public void close() throws IOException { - bin.close(); + */ public void close() throws IOException { + bin.close(); } public File getFile() { - return file; + return file; } - @Override - public Iterator iterator() { - Iterator iter = new Iterator() { + @Override + public Iterator iterator() { + + return new Iterator<>() { @Override - public boolean hasNext() { - return null != next; + public boolean hasNext() { + return null != next; } - - @Override - //return the stored record (next), even it is null - public T next() { - T rec = next; - next = null; //in case exception happen, same line repeatedly - - try { - //get next record, it may read multi lines - String line = bin.readLine(); - if ( line != null ) { - next = getRecord( line ); - } - - return rec; - } catch (IOException e) { - //here we only catch IO exception - throw new UncheckedIOException(e); - } + + @Override + //return the stored record (next), even it is null + public T next() { + T rec = next; + next = null; //in case exception happen, same line repeatedly + + try { + //get next record, it may read multi lines + String line = bin.readLine(); + if (line != null) { + next = getRecord(line); + } + + return rec; + } catch (IOException e) { + //here we only catch IO exception + throw new UncheckedIOException(e); + } } }; - - return iter; - } - - //some record cross multi lines, eg id\nseq\n, this method may call bin.readLine() inside - public abstract T getRecord(String line); + } + + //some record cross multi lines, eg 
id\nseq\n, this method may call bin.readLine() inside + public abstract T getRecord(String line); } diff --git a/qio/src/org/qcmg/qio/record/StringFileReader.java b/qio/src/org/qcmg/qio/record/StringFileReader.java index 3ffc467d4..1613cac10 100644 --- a/qio/src/org/qcmg/qio/record/StringFileReader.java +++ b/qio/src/org/qcmg/qio/record/StringFileReader.java @@ -29,10 +29,7 @@ public StringFileReader(final File file, int bufferSize, CharSequence headerPref @Override - /** - * return input self even it is null - */ - public String getRecord(String line) { + public String getRecord(String line) { return line; } } \ No newline at end of file diff --git a/qsignature/src/org/qcmg/sig/Generate.java b/qsignature/src/org/qcmg/sig/Generate.java index a6d44c65c..1c12fefb6 100644 --- a/qsignature/src/org/qcmg/sig/Generate.java +++ b/qsignature/src/org/qcmg/sig/Generate.java @@ -200,7 +200,11 @@ private void processBamFiles() throws IOException { * Set chrComparator and * order snps based on bam contig order */ - chrComparator = ChrPositionComparator.getChrNameComparator(bamContigs); + Map contigOrderMap = new LinkedHashMap<>(); + for (int i = 0; i < bamContigs.size(); i++) { + contigOrderMap.put(bamContigs.get(i), i); + } + chrComparator = ChrPositionComparator.getChrNameComparator(contigOrderMap); positionsIterator.sort(bamContigs); /* @@ -251,10 +255,6 @@ private void processIlluminaFiles() throws IOException { logger.info("got following details from illumina file:" + illuminaFile.getName()); logger.info("patient: " + patient + ", sample: " + sample + ", inputType: " + inputType); - if (null != inputType && inputType.length() == 4) { - inputType = inputType.substring(1, 3); - } - /* * load data from snp chip file into map */ From 7f2a02b85a5d1df5f5e562a8c06cdb60d4954cb4 Mon Sep 17 00:00:00 2001 From: Oliver Holmes Date: Wed, 28 Feb 2024 15:41:36 +1000 Subject: [PATCH 2/2] added MT to list of acceptable contig names for shortcut method --- 
qcommon/src/org/qcmg/common/util/ChrPositionUtils.java | 1 + 1 file changed, 1 insertion(+) diff --git a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java index cd5c5ae8b..8bd252bda 100644 --- a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java +++ b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java @@ -79,6 +79,7 @@ public static int convertContigNameToInt(String contigName) { case "X" -> 23; case "Y" -> 24; case "M" -> 25; + case "MT" -> 25; default -> contigName.hashCode(); }; }