diff --git a/build.gradle b/build.gradle index 442c68c5e..03ff7202c 100644 --- a/build.gradle +++ b/build.gradle @@ -54,6 +54,10 @@ subprojects { checkstyleTest.enabled=false } checkstyleMain.onlyIf {project.hasProperty('checkstyle')} + checkstyleMain { + mustRunAfter test + mustRunAfter compileJava + } dependencies { testImplementation 'junit:junit:4.13.2' diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java b/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java index 68a14ebef..5b2a96645 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/Annotate.java @@ -1,371 +1,333 @@ package au.edu.qimr.qannotate.nanno; -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Queue; -import java.util.concurrent.CompletableFuture; -import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.Executor; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; - +import au.edu.qimr.qannotate.Messages; +import au.edu.qimr.qannotate.Options; import org.qcmg.common.log.QLogger; import org.qcmg.common.log.QLoggerFactory; import org.qcmg.common.meta.QExec; import org.qcmg.common.model.ChrPosition; import org.qcmg.common.model.ChrPositionRefAlt; +import org.qcmg.common.util.ChrPositionUtils; import org.qcmg.common.vcf.VcfRecord; import org.qcmg.common.vcf.VcfUtils; import org.qcmg.qio.record.RecordWriter; import org.qcmg.qio.vcf.VcfFileReader; -import au.edu.qimr.qannotate.Messages; -import au.edu.qimr.qannotate.Options; +import java.io.File; +import java.io.IOException; +import java.util.*; +import java.util.concurrent.*; +import java.util.stream.Collectors; public 
class Annotate { - - static final List SEARCH_TERM_VARIETIES = Arrays.asList(">", "->", "-->", "/"); - - static Comparator CUSTOM_COMPARATOR; - static QLogger logger; - - private int exitStatus; - - private String logFile; - private String inputFile; - private String outputFile; - private String jsonInputs; - - private QExec exec; - - public int engage() throws Exception { - - /* - * parse the json file into an AnnotationINputs object - */ - AnnotationInputs ais = AnnotateUtils.getInputs(jsonInputs); - logger.info("Number of annotation source threads to use: " + ais.getAnnotationSourceThreadCount()); - /* - * create a comparator that will be used to sort the annotation fields for output - */ - CUSTOM_COMPARATOR = AnnotateUtils.createComparatorFromList(Arrays.stream(ais.getOutputFieldOrder().split(",")).collect(Collectors.toList())); - logger.info("Custom comparator created"); - /* - * check headers that have been supplied in the json inputs file - */ - int headersOK = AnnotateUtils.checkHeaders(ais); - if (headersOK == 1) { - logger.error("Headers have been checked - not OK!!!"); - System.exit(headersOK); - } - logger.info("Headers have been checked - OK"); - - List annotationSources = new ArrayList<>(); - AnnotateUtils.populateAnnotationSources(ais, annotationSources); - logger.info("annotationSources have been loaded (size: " + annotationSources.size() + ")"); - annotationSources.stream().forEach(as -> logger.info(as.toString())); - - CountDownLatch consumerLatch = new CountDownLatch(1); - Queue queue = new ConcurrentLinkedQueue<>(); - - - ExecutorService executor = Executors.newFixedThreadPool(Math.max(ais.getAnnotationSourceThreadCount(), 1) + 1); // need an extra thread for the consumer, and at least 1 other thread - executor.execute(new Consumer(queue, outputFile, consumerLatch, ais, exec)); - logger.info("ExecutorService has been setup"); - - ChrPosition lastCP = null; - try ( - VcfFileReader reader = new VcfFileReader(inputFile);) { - 
logger.info("VcfFileReader has been setup"); - int vcfCount = 0; - for (VcfRecord vcf : reader) { - vcfCount++; - - ChrPosition thisVcfsCP = vcf.getChrPositionRefAlt(); - logger.debug("thisVcfsCP: " + thisVcfsCP.toIGVString()); - - - /* - * check that this CP is "after" the last CP - */ - int compare = null != lastCP ? ((ChrPositionRefAlt)thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0; - if (compare < 0) { - throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString()); - } - - - String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt(); - String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0); - String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0); - - if (alt.contains(",")) { - logger.info("alt has comma: " + thisVcfsCP.toString()); - /* - * split record, create new ChrPositions for each - */ - String [] altArray = alt.split(","); - Map altToADMap = AnnotateUtils.getADForSplitVcfRecords(altArray, gatkAD); - List splitVcfs = new ArrayList<>(); - for (String thisAlt : altArray) { - if (thisAlt.equals("*")) { - /* - * ignore - */ - } else { - VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt); - splitVcfs.add(newVcf); - } - } - if (splitVcfs.size() > 1) { - /* - * sort - */ - splitVcfs.sort(null); - } - for (VcfRecord splitVcf : splitVcfs) { - List annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor)); - queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt)); - } - - } else { - - logger.debug("about to get annotations for: " + thisVcfsCP.toIGVString()); - List annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor); - logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue"); - queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, 
gatkGT, alt)); - - } - - lastCP = thisVcfsCP; - } - - logger.info("# of vcf records: " + vcfCount); - } finally { - /* - * count down the count down latch - */ - consumerLatch.countDown(); - } - executor.shutdown(); - executor.awaitTermination(60, TimeUnit.MINUTES); - logger.info("ExecutorService has been shutdown"); - return exitStatus; - } - - - private static List getAnnotationsForPosition(ChrPosition cp, List annotationSources, Executor executor) { - - return annotationSources.stream() - .map(source -> CompletableFuture.supplyAsync(() -> - source.getAnnotation(cp), executor)) - .map(CompletableFuture::join).collect(Collectors.toList()); - } - - public static class ChrPositionAnnotations { - - public String getGatkAD() { - return gatkAD; - } - - public ChrPosition getCp() { - return cp; - } - - public List getAnnotations() { - return annotations; - } - - public String getGatkGT() { - return gatkGT; - } - - public String getOriginalAlt() { - return originalAlt; - } - - ChrPosition cp; - List annotations; - String gatkAD; - String gatkGT; - String originalAlt; - - public ChrPositionAnnotations(ChrPosition cp, List annotations, String gatkAD, String gatkGT, String originalAlt) { - super(); - this.cp = cp; - this.annotations = annotations; - this.gatkAD = gatkAD; - this.gatkGT = gatkGT; - this.originalAlt = originalAlt; - } - - public String toStringMinusAnnotations() { - return ((ChrPositionRefAlt)cp).toTabSeperatedString() + "\t" + originalAlt + "\t" + gatkGT + "\t" + gatkAD; - } - - } - - public static class Consumer implements Runnable { - - private final Queue queue; - private final String outputFile; - private final boolean includeSearchTerm; - private final CountDownLatch latch; - private final RecordWriter writer; - private final String additionalEmptyValues; - private final AnnotationInputs ais; - - public Consumer(Queue queue, String outputFile, CountDownLatch latch, AnnotationInputs ais, QExec exec) throws IOException { - this.queue = queue; - 
this.outputFile = outputFile; - this.latch = latch; - this.ais = ais; - includeSearchTerm = ais.isIncludeSearchTerm(); - additionalEmptyValues = AnnotateUtils.generateAdditionalEmptyValues(ais); - List headers = AnnotateUtils.generateHeaders(ais, exec); - - writer = new RecordWriter(new File(outputFile)); - for (String h : headers) { - writer.addHeader(h); - } - } - - @Override - public void run() { - logger.info("Consumer thread is a go!"); - try { - - while (true) { - - final ChrPositionAnnotations rec = queue.poll(); - if (null != rec) { - - processRecord(rec); - - } else { - if (latch.getCount() == 0) { - break; - } - // sleep and try again - try { - Thread.sleep(20); - } catch (final InterruptedException e) { - logger.error("InterruptedException caught in Consumer sleep: " + e.getLocalizedMessage()); - throw e; - } finally { - } - } - } - } catch (final Exception e) { - e.printStackTrace(); - logger.error("Exception caught in Consumer class: " + e.getCause().getMessage()); - } finally { - logger.info("Consumer: shutting down"); - /* - * close writer - */ - try { - writer.close(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - } - } - - public void processRecord(final ChrPositionAnnotations recAndAnnotations) throws IOException { - ChrPosition cp = recAndAnnotations.getCp(); - - - List annotations = recAndAnnotations.getAnnotations(); - logger.debug("annotations.size(): " + annotations.size()); - - /* - * collect entries in annotations lists into map - */ - List singleAnnotations = AnnotateUtils.convertAnnotations(annotations); - logger.debug("singleAnnotations.size(): " + singleAnnotations.size()); - - - String searchTerm = ""; - if (includeSearchTerm) { - String hgvsC = AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.c").orElse(null); - String hgvsP = AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.p").orElse(null); - searchTerm = AnnotateUtils.getSearchTerm(hgvsC, hgvsP); - } - /* - * 
sort and write out to file - */ - String annotationString = singleAnnotations.stream().map(s -> s.split("=", 2)).sorted(CUSTOM_COMPARATOR).map(a -> a[1]).collect(Collectors.joining("\t")); - - logger.debug("annotationString: " + annotationString); - - writer.add(recAndAnnotations.toStringMinusAnnotations() + "\t" + annotationString + additionalEmptyValues + (includeSearchTerm ? "\t" + searchTerm : "")); - } - } - - public static void main(String[] args) throws Exception { - final Annotate sp = new Annotate(); - int exitStatus = 0; - try { - exitStatus = sp.setup(args); - } catch (final Exception e) { - exitStatus = 1; - if (null != logger) { - logger.error("Exception caught whilst running Annotate:", e); - } else { - System.err.println("Exception caught whilst running Annotate"); - } - e.printStackTrace(); - } - - if (null != logger) { - logger.logFinalExecutionStats(exitStatus); - } - System.exit(exitStatus); - } - - protected int setup(String args[]) throws Exception { - int returnStatus = 1; - if (null == args || args.length == 0) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - System.exit(1); - } - final Options options = new Options(args); - - System.out.println("options.getInputFileName: " + options.getInputFileName()); - System.out.println("options.getOutputFileName: " + options.getOutputFileName()); - System.out.println("options.getOutputFileName: " + options.getOutputFileName()); - System.out.println("options.getConfigFileName: " + options.getConfigFileName()); - if ( null == options.getInputFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else if ( null == options.getOutputFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else if ( null == options.getLogFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else if ( null == options.getConfigFileName()) { - System.err.println(Messages.getMessage("NANNO_USAGE")); - } else { - // configure logging - logFile = 
options.getLogFileName(); - logger = QLoggerFactory.getLogger(Annotate.class, logFile, options.getLogLevel()); - exec = logger.logInitialExecutionStats("Annotate", Annotate.class.getPackage().getImplementationVersion(), args); - outputFile = options.getOutputFileName(); - inputFile = options.getInputFileName(); - jsonInputs = options.getConfigFileName(); - - return engage(); - } - - return returnStatus; - } + + static final List SEARCH_TERM_VARIETIES = Arrays.asList(">", "->", "-->", "/"); + + static Comparator CUSTOM_COMPARATOR; + static QLogger logger; + + private int exitStatus; + + private String inputFile; + private String outputFile; + private String jsonInputs; + + private QExec exec; + + public int engage() throws Exception { + + /* + * parse the json file into an AnnotationInputs object + */ + AnnotationInputs ais = AnnotateUtils.getInputs(jsonInputs); + logger.info("Number of annotation source threads to use: " + ais.getAnnotationSourceThreadCount()); + /* + * create a comparator that will be used to sort the annotation fields for output + */ + CUSTOM_COMPARATOR = AnnotateUtils.createComparatorFromList(Arrays.stream(ais.getOutputFieldOrder().split(",")).collect(Collectors.toList())); + logger.info("Custom comparator created"); + /* + * check headers that have been supplied in the json inputs file + */ + int headersOK = AnnotateUtils.checkHeaders(ais); + if (headersOK == 1) { + logger.error("Headers have been checked - not OK!!!"); + System.exit(headersOK); + } + logger.info("Headers have been checked - OK"); + + List annotationSources = new ArrayList<>(); + AnnotateUtils.populateAnnotationSources(ais, annotationSources); + logger.info("annotationSources have been loaded (size: " + annotationSources.size() + ")"); + annotationSources.forEach(as -> logger.info(as.toString())); + + CountDownLatch consumerLatch = new CountDownLatch(1); + Queue queue = new ConcurrentLinkedQueue<>(); + + + ExecutorService executor = 
Executors.newFixedThreadPool(Math.max(ais.getAnnotationSourceThreadCount(), 1) + 1); // need an extra thread for the consumer, and at least 1 other thread + executor.execute(new Consumer(queue, outputFile, consumerLatch, ais, exec)); + logger.info("ExecutorService has been setup"); + + ChrPosition lastCP = null; + try ( + VcfFileReader reader = new VcfFileReader(inputFile)) { + logger.info("VcfFileReader has been setup"); + int vcfCount = 0; + for (VcfRecord vcf : reader) { + vcfCount++; + + ChrPosition thisVcfsCP = vcf.getChrPositionRefAlt(); + logger.debug("thisVcfsCP: " + thisVcfsCP.toIGVString()); + + + /* + * check that this CP is "after" the last CP + */ + int compare = null != lastCP ? ((ChrPositionRefAlt) thisVcfsCP).compareTo((ChrPositionRefAlt) lastCP) : 0; + if (compare < 0) { + throw new IllegalArgumentException("Incorrect order of vcf records in input vcf file! this vcf: " + thisVcfsCP.toIGVString() + ", last vcf: " + lastCP.toIGVString()); + } + + + String alt = ((ChrPositionRefAlt) thisVcfsCP).getAlt(); + String gatkAD = VcfUtils.getFormatField(vcf.getFormatFields(), "AD", 0); + String gatkGT = VcfUtils.getFormatField(vcf.getFormatFields(), "GT", 0); + + if (alt.contains(",")) { + logger.info("alt has comma: " + thisVcfsCP); + /* + * split record, create new ChrPositions for each + */ + String[] altArray = alt.split(","); + List splitVcfs = new ArrayList<>(); + for (String thisAlt : altArray) { + if (thisAlt.equals("*")) { + /* + * ignore + */ + } else { + VcfRecord newVcf = VcfUtils.cloneWithNewAlt(vcf, thisAlt); + splitVcfs.add(newVcf); + } + } + if (splitVcfs.size() > 1) { + /* + * sort + */ + splitVcfs.sort(null); + } + for (VcfRecord splitVcf : splitVcfs) { + List annotations = new ArrayList<>(getAnnotationsForPosition(splitVcf.getChrPositionRefAlt(), annotationSources, executor)); + queue.add(new ChrPositionAnnotations(splitVcf.getChrPositionRefAlt(), annotations, gatkAD, gatkGT, alt)); + } + + } else { + + logger.debug("about to get 
annotations for: " + thisVcfsCP.toIGVString()); + List annotations = getAnnotationsForPosition(thisVcfsCP, annotationSources, executor); + logger.debug("got annotations for: " + thisVcfsCP.toIGVString() + " - adding to queue"); + queue.add(new ChrPositionAnnotations(thisVcfsCP, annotations, gatkAD, gatkGT, alt)); + + } + + lastCP = thisVcfsCP; + } + + logger.info("# of vcf records: " + vcfCount); + } finally { + /* + * count down the count down latch + */ + consumerLatch.countDown(); + } + executor.shutdown(); + executor.awaitTermination(60, TimeUnit.MINUTES); + logger.info("ExecutorService has been shutdown"); + return exitStatus; + } + + + private static List getAnnotationsForPosition(ChrPosition cp, List annotationSources, Executor executor) { + long contigAndPosition = ((ChrPositionUtils.convertContigAndPositionToLong(cp.getChromosome().startsWith("chr") ? cp.getChromosome().substring(3) : cp.getChromosome(), cp.getStartPosition()))); + return annotationSources.stream() + .map(source -> CompletableFuture.supplyAsync(() -> + source.getAnnotation(contigAndPosition, cp), executor)) + .map(CompletableFuture::join).collect(Collectors.toList()); + } + + public static class ChrPositionAnnotations { + + public List getAnnotations() { + return annotations; + } + + ChrPosition cp; + List annotations; + String gatkAD; + String gatkGT; + String originalAlt; + + public ChrPositionAnnotations(ChrPosition cp, List annotations, String gatkAD, String gatkGT, String originalAlt) { + super(); + this.cp = cp; + this.annotations = annotations; + this.gatkAD = gatkAD; + this.gatkGT = gatkGT; + this.originalAlt = originalAlt; + } + + public String toStringMinusAnnotations() { + return ((ChrPositionRefAlt) cp).toTabSeperatedString() + "\t" + originalAlt + "\t" + gatkGT + "\t" + gatkAD; + } + + } + + public static class Consumer implements Runnable { + + private final Queue queue; + private final boolean includeSearchTerm; + private final CountDownLatch latch; + private final 
RecordWriter writer; + private final String additionalEmptyValues; + + public Consumer(Queue queue, String outputFile, CountDownLatch latch, AnnotationInputs ais, QExec exec) throws IOException { + this.queue = queue; + this.latch = latch; + includeSearchTerm = ais.isIncludeSearchTerm(); + additionalEmptyValues = AnnotateUtils.generateAdditionalEmptyValues(ais); + List headers = AnnotateUtils.generateHeaders(ais, exec); + + writer = new RecordWriter<>(new File(outputFile)); + for (String h : headers) { + writer.addHeader(h); + } + } + + @Override + public void run() { + logger.info("Consumer thread is a go!"); + try { + + while (true) { + + final ChrPositionAnnotations rec = queue.poll(); + if (null != rec) { + + processRecord(rec); + + } else { + if (latch.getCount() == 0) { + break; + } + // sleep and try again + try { + Thread.sleep(20); + } catch (final InterruptedException e) { + logger.error("InterruptedException caught in Consumer sleep: " + e.getLocalizedMessage()); + throw e; + } + } + } + } catch (final Exception e) { + e.printStackTrace(); + logger.error("Exception caught in Consumer class: " + e.getCause().getMessage()); + } finally { + logger.info("Consumer: shutting down"); + /* + * close writer + */ + try { + writer.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + public void processRecord(final ChrPositionAnnotations recAndAnnotations) throws IOException { + + List annotations = recAndAnnotations.getAnnotations(); + logger.debug("annotations.size(): " + annotations.size()); + + /* + * collect entries in annotations lists into map + */ + List singleAnnotations = AnnotateUtils.convertAnnotations(annotations); + logger.debug("singleAnnotations.size(): " + singleAnnotations.size()); + + + String searchTerm = ""; + if (includeSearchTerm) { + String hgvsC = AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.c").orElse(null); + String hgvsP = 
AnnotateUtils.getAnnotationFromList(singleAnnotations, "hgvs.p").orElse(null); + searchTerm = AnnotateUtils.getSearchTerm(hgvsC, hgvsP); + } + /* + * sort and write out to file + */ + String annotationString = singleAnnotations.stream().map(s -> s.split("=", 2)).sorted(CUSTOM_COMPARATOR).map(a -> a[1]).collect(Collectors.joining("\t")); + + logger.debug("annotationString: " + annotationString); + + writer.add(recAndAnnotations.toStringMinusAnnotations() + "\t" + annotationString + additionalEmptyValues + (includeSearchTerm ? "\t" + searchTerm : "")); + } + } + + public static void main(String[] args) { + final Annotate sp = new Annotate(); + int exitStatus = 0; + try { + exitStatus = sp.setup(args); + } catch (final Exception e) { + exitStatus = 1; + if (null != logger) { + logger.error("Exception caught whilst running Annotate:", e); + } else { + System.err.println("Exception caught whilst running Annotate"); + } + e.printStackTrace(); + } + + if (null != logger) { + logger.logFinalExecutionStats(exitStatus); + } + System.exit(exitStatus); + } + + protected int setup(String [] args) throws Exception { + int returnStatus = 1; + if (null == args || args.length == 0) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + System.exit(1); + } + final Options options = new Options(args); + + System.out.println("options.getInputFileName: " + options.getInputFileName()); + System.out.println("options.getOutputFileName: " + options.getOutputFileName()); + System.out.println("options.getConfigFileName: " + options.getConfigFileName()); + if (null == options.getInputFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else if (null == options.getOutputFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else if (null == options.getLogFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else if (null == options.getConfigFileName()) { + System.err.println(Messages.getMessage("NANNO_USAGE")); + } else { 
+ // configure logging + String logFile = options.getLogFileName(); + logger = QLoggerFactory.getLogger(Annotate.class, logFile, options.getLogLevel()); + exec = logger.logInitialExecutionStats("Annotate", Annotate.class.getPackage().getImplementationVersion(), args); + outputFile = options.getOutputFileName(); + inputFile = options.getInputFileName(); + jsonInputs = options.getConfigFileName(); + + return engage(); + } + + return returnStatus; + } } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java index 05f5e5a95..c454893e4 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotateUtils.java @@ -1,21 +1,8 @@ package au.edu.qimr.qannotate.nanno; -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Paths; -import java.time.LocalDateTime; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Optional; -import java.util.Set; -import java.util.stream.Collectors; - +import au.edu.qimr.qannotate.nanno.AnnotationInputs.AnnotationInput; +import com.fasterxml.jackson.databind.DeserializationFeature; +import com.fasterxml.jackson.databind.ObjectMapper; import org.qcmg.common.log.QLogger; import org.qcmg.common.log.QLoggerFactory; import org.qcmg.common.meta.QExec; @@ -23,273 +10,271 @@ import org.qcmg.common.util.TabTokenizer; import org.qcmg.qio.record.StringFileReader; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; - -import au.edu.qimr.qannotate.nanno.AnnotationInputs.AnnotationInput; +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.*; +import java.util.stream.Collectors; 
public class AnnotateUtils { - - public static final QLogger logger = QLoggerFactory.getLogger(AnnotateUtils.class); - - public static AnnotationInputs getInputs(String file) throws IOException { - //read json file data to String - byte[] jsonData = Files.readAllBytes(Paths.get(file)); - //create ObjectMapper instance - ObjectMapper objectMapper = new ObjectMapper(); - objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - //convert json string to object - AnnotationInputs ai = objectMapper.readValue(jsonData, AnnotationInputs.class); - - return ai; - } - - public static Comparator createComparatorFromList(final List sortedList) { - Comparator c = new Comparator() { - @Override - public int compare(String[] o1, String[] o2) { - final int index1 = sortedList.indexOf(o1[0]); - if (index1 == -1) return 1; - final int index2 = sortedList.indexOf(o2[0]); - if (index2 == -1) return -1; - return index1 - index2; - } - }; - return c; - } - - /** - * @param ais - * @param annotationSources - * @throws IOException - */ - public static void populateAnnotationSources(AnnotationInputs ais, List annotationSources) throws IOException { - for (AnnotationInput ai : ais.getInputs()) { - String fileName = ai.getFile(); - String fieldNames = ai.getFields(); - - logger.info("fileName: " + fileName + ", positions: " + ai.getChrIndex() + ", " + ai.getPositionIndex() + ", " + ai.getRefIndex() + ", " + ai.getAltIndex() + ", fieldNames: " + fieldNames); - - if (ai.isSnpEffVcf()) { - annotationSources.add(new AnnotationSourceSnpEffVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames)); - } else if (fileName.contains("vcf")) { - annotationSources.add(new AnnotationSourceVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames)); - } else { - annotationSources.add(new 
AnnotationSourceTSV(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames)); - } - } - } - - public static int checkHeaders(AnnotationInputs ais) { - List annotationFields = ais.getInputs().stream().map(ai -> ai.getFields()).collect(Collectors.toList()); - boolean headersValid = AnnotateUtils.isOrderedHeaderListValid(ais.getOutputFieldOrder(), annotationFields.toArray(new String[]{})); - - if ( ! headersValid) { - System.err.println("headers are not valid! OrderedHeader: " + ais.getOutputFieldOrder() + "\nAnnotation fields: " + (ais.getInputs().stream().map(ai -> ai.getFields())).collect(Collectors.joining(","))); - return 1; - } - return 0; - } - - /** - * checks to see if the sortedHEader contains all the fields from the various annotation sources - * Not sure what to do if we have 2 fields with the same name (presumably from different sources) - * - * - * @param ai - * @return - */ - public static boolean isOrderedHeaderListValid(String sortedHeader, String ... fieldsFromAnnotationSources) { - if (StringUtils.isNullOrEmpty(sortedHeader)) { - /* - * empty or null sorted header - not valid - */ - logger.error("sortedHeader is null or empty"); - return false; - } - if (null == fieldsFromAnnotationSources || fieldsFromAnnotationSources.length == 0) { - /* - * empty or null annotation fields - not valid - */ - logger.error("fieldsFromAnnotationSources is null or length is 0"); - return false; - } - - Set sortedHeaderSet = Arrays.stream(sortedHeader.split(",")).collect(Collectors.toSet()); - Set fieldsFromAnnotationSourcesSet = Arrays.stream(String.join(",", fieldsFromAnnotationSources).split(",")).collect(Collectors.toSet()); - - for (String s : sortedHeaderSet) { - if ( ! fieldsFromAnnotationSourcesSet.contains(s)) { - logger.error(s + " in header but not found in any data source!"); - } - } - for (String s : fieldsFromAnnotationSourcesSet) { - if ( ! 
sortedHeaderSet.contains(s)) { - logger.error(s + " in data source but not found in header!"); - } - } - - return sortedHeaderSet.containsAll(fieldsFromAnnotationSourcesSet) && fieldsFromAnnotationSourcesSet.containsAll(sortedHeaderSet); - } - - public static String getEmptyHeaderValues(int count) { - if (count <= 0) { - return ""; - } - return org.apache.commons.lang3.StringUtils.repeat("\t", count); - } - - public static int countOccurrences(String s, String t) { - return org.apache.commons.lang3.StringUtils.countMatches(s, t); - } - - /** - * Create a PubMed search term using the hgvsC and hgvsP values - * @param hgvsC - * @param hgvsP - * @return - */ - public static String getSearchTerm(String hgvsC, String hgvsP) { - String st = ""; - - /* - * check the optionals - if they are both not present, no need to proceed - */ - if (( hgvsC == null && hgvsP == null)) { - return st; - } - - if ( hgvsC != null && ! hgvsC.isEmpty()) { - - /* - * need to check that the string contains the dot ('.') and the gt sign ('>') - */ - int dotIndex = hgvsC.indexOf('.'); - int gtIndex = hgvsC.indexOf('>'); - if (dotIndex > -1 && gtIndex > -1) { - - /* - * split value into required parts - */ - String firstPart = hgvsC.substring(dotIndex + 1, gtIndex); - String secondPart = hgvsC.substring(gtIndex + 1); - - st += Annotate.SEARCH_TERM_VARIETIES.stream().map(s -> "\"" + firstPart + s + secondPart + "\"").collect(Collectors.joining("|")); - } - } - - if ( hgvsP != null && ! hgvsP.isEmpty()) { - if ( ! st.isEmpty()) { - /* - * we must have hgvs.c data - so add bar - */ - st += "|"; - } - st += "\"" + hgvsP.substring(hgvsP.indexOf('.') + 1) + "\""; - } - - if ( ! 
st.isEmpty()) { - return "\"GENE\"+(" + st + ")"; - } - return st; - } - - /** - * Splits the strings in the supplied list by tab, and flattens them to a single list - */ - public static List convertAnnotations(List manyAnnotations) { - if (null != manyAnnotations) { - return manyAnnotations.stream().flatMap(s -> java.util.Arrays.stream(TabTokenizer.tokenize(s))).collect(Collectors.toList()); - } - return Collections.emptyList(); - } - - /** - * get the requiredAnnotation value from the list of annotations - * return null if not present - * - * @param listOfAnnotations - * @param requiredAnnotation - * @return - */ - public static Optional getAnnotationFromList(List listOfAnnotations, String requiredAnnotation) { - - if (null != listOfAnnotations && ! StringUtils.isNullOrEmpty(requiredAnnotation)) { - for (String anno : listOfAnnotations) { - if (anno.startsWith(requiredAnnotation)) { - return Optional.of(anno.substring(requiredAnnotation.length() + 1)); // don't forget the equals sign - } - } - } - return Optional.empty(); - } - - /** - * - * @param altArray - * @param gatkAD - * @return - */ - public static Map getADForSplitVcfRecords(String [] altArray, String gatkAD) { - - Map altToADMap = new HashMap<>(4); - String [] gatkADArray = gatkAD.split(","); - /* - * should have 1 more in the gatkADArray than the altArray - */ - if (altArray.length == gatkADArray.length - 1) { - for (int i = 0 ; i < altArray.length ; i++) { - altToADMap.put(altArray[i], gatkADArray[0] + "," + gatkADArray[i + 1]); - } - } - - return altToADMap; - } - - public static List generateHeaders(AnnotationInputs ais, QExec exec) { - List headers = new ArrayList<>(); - if (null != exec) { - headers.add("##" + exec.getStartTime().toLogString()); - headers.add("##" + exec.getUuid().toLogString()); - headers.add("##" + exec.getHost().toLogString()); - headers.add("##" + exec.getRunBy().toLogString()); - headers.add("##" + exec.getJavaVersion().toLogString()); - headers.add("##" + 
exec.getToolName().toLogString()); - headers.add("##" + exec.getToolVersion().toLogString()); - headers.add("##" + exec.getCommandLine().toLogString()); - } - if (null != ais && null != ais.getInputs()) { - - for (AnnotationInput ai : ais.getInputs()) { - headers.add("##file:fields\t" + ai.getFile() + ":" + ai.getFields()); - } - - String emptyHeaders = ais.getAdditionalEmptyFields(); - String [] emptyHeadersArray = StringUtils.isNullOrEmpty(emptyHeaders) ? new String[]{} : emptyHeaders.split(","); - String fieldOrder = ais.getOutputFieldOrder(); - String [] fieldOrderArray = StringUtils.isNullOrEmpty(fieldOrder) ? new String[]{} : fieldOrder.split(","); - - String header = "#chr\tposition\tref\talt\toriginal_alt\tGATK_GT\tGATK_AD\t" + Arrays.stream(fieldOrderArray).collect(Collectors.joining("\t")); - if (emptyHeadersArray.length > 0) { - header += "\t" + Arrays.stream(emptyHeadersArray).collect(Collectors.joining("\t")); - } - - boolean includeSearchTerm = ais.isIncludeSearchTerm(); - header += (includeSearchTerm ? 
"\tsearchTerm" : ""); - headers.add(header); - } - - return headers; - } - - public static String generateAdditionalEmptyValues(AnnotationInputs ais) { - String emptyHeaders = ais.getAdditionalEmptyFields(); - - if (StringUtils.isNullOrEmpty(emptyHeaders)) { - return ""; - } else { - return getEmptyHeaderValues(org.apache.commons.lang3.StringUtils.countMatches(emptyHeaders, ",") + 1); - } - } + + public static final QLogger logger = QLoggerFactory.getLogger(AnnotateUtils.class); + + public static AnnotationInputs getInputs(String file) throws IOException { + //read json file data to String + byte[] jsonData = Files.readAllBytes(Paths.get(file)); + //create ObjectMapper instance + ObjectMapper objectMapper = new ObjectMapper(); + objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); + //convert json string to object + + return objectMapper.readValue(jsonData, AnnotationInputs.class); + } + + public static Comparator createComparatorFromList(final List sortedList) { + return (o1, o2) -> { + final int index1 = sortedList.indexOf(o1[0]); + if (index1 == -1) return 1; + final int index2 = sortedList.indexOf(o2[0]); + if (index2 == -1) return -1; + return index1 - index2; + }; + } + + /** + * @param ais + * @param annotationSources + * @throws IOException + */ + public static void populateAnnotationSources(AnnotationInputs ais, List annotationSources) throws IOException { + for (AnnotationInput ai : ais.getInputs()) { + String fileName = ai.getFile(); + String fieldNames = ai.getFields(); + + logger.info("fileName: " + fileName + ", positions: " + ai.getChrIndex() + ", " + ai.getPositionIndex() + ", " + ai.getRefIndex() + ", " + ai.getAltIndex() + ", fieldNames: " + fieldNames); + + if (ai.isSnpEffVcf()) { + annotationSources.add(new AnnotationSourceSnpEffVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames, ai.isChrStartsWithChr())); + } else if 
(fileName.contains("vcf")) { + annotationSources.add(new AnnotationSourceVCF(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames, ai.isChrStartsWithChr())); + } else { + annotationSources.add(new AnnotationSourceTSV(new StringFileReader(new File(fileName), 1024 * 1024), ai.getChrIndex(), ai.getPositionIndex(), ai.getRefIndex(), ai.getAltIndex(), fieldNames, ai.isChrStartsWithChr())); + } + } + } + + public static int checkHeaders(AnnotationInputs ais) { + List annotationFields = ais.getInputs().stream().map(AnnotationInput::getFields).toList(); + boolean headersValid = AnnotateUtils.isOrderedHeaderListValid(ais.getOutputFieldOrder(), annotationFields.toArray(new String[]{})); + + if (!headersValid) { + System.err.println("headers are not valid! OrderedHeader: " + ais.getOutputFieldOrder() + "\nAnnotation fields: " + (ais.getInputs().stream().map(AnnotationInput::getFields)).collect(Collectors.joining(","))); + return 1; + } + return 0; + } + + /** + * Checks if the ordered list of headers is valid. + * + * @param sortedHeader The sorted header string. + * @param fieldsFromAnnotationSources The fields from annotation sources. + * @return Returns true if the ordered header list is valid, false otherwise. + */ + public static boolean isOrderedHeaderListValid(String sortedHeader, String... 
fieldsFromAnnotationSources) { + if (StringUtils.isNullOrEmpty(sortedHeader)) { + /* + * empty or null sorted header - not valid + */ + logger.error("sortedHeader is null or empty"); + return false; + } + if (null == fieldsFromAnnotationSources || fieldsFromAnnotationSources.length == 0) { + /* + * empty or null annotation fields - not valid + */ + logger.error("fieldsFromAnnotationSources is null or length is 0"); + return false; + } + + Set sortedHeaderSet = Arrays.stream(sortedHeader.split(",")).collect(Collectors.toSet()); + Set fieldsFromAnnotationSourcesSet = Arrays.stream(String.join(",", fieldsFromAnnotationSources).split(",")).collect(Collectors.toSet()); + + for (String s : sortedHeaderSet) { + if (!fieldsFromAnnotationSourcesSet.contains(s)) { + logger.error(s + " in header but not found in any data source!"); + } + } + for (String s : fieldsFromAnnotationSourcesSet) { + if (!sortedHeaderSet.contains(s)) { + logger.error(s + " in data source but not found in header!"); + } + } + + return sortedHeaderSet.containsAll(fieldsFromAnnotationSourcesSet) && fieldsFromAnnotationSourcesSet.containsAll(sortedHeaderSet); + } + + public static String getEmptyHeaderValues(int count) { + if (count <= 0) { + return ""; + } + return org.apache.commons.lang3.StringUtils.repeat("\t", count); + } + + public static int countOccurrences(String s, String t) { + return org.apache.commons.lang3.StringUtils.countMatches(s, t); + } + + /** + * Create a PubMed search term using the hgvsC and hgvsP values + * + * @param hgvsC + * @param hgvsP + * @return + */ + public static String getSearchTerm(String hgvsC, String hgvsP) { + String st = ""; + + /* + * check the optionals - if they are both not present, no need to proceed + */ + if ((hgvsC == null && hgvsP == null)) { + return st; + } + + if (hgvsC != null && !hgvsC.isEmpty()) { + + /* + * need to check that the string contains the dot ('.') and the gt sign ('>') + */ + int dotIndex = hgvsC.indexOf('.'); + int gtIndex = 
hgvsC.indexOf('>'); + if (dotIndex > -1 && gtIndex > -1) { + + /* + * split value into required parts + */ + String firstPart = hgvsC.substring(dotIndex + 1, gtIndex); + String secondPart = hgvsC.substring(gtIndex + 1); + + st += Annotate.SEARCH_TERM_VARIETIES.stream().map(s -> "\"" + firstPart + s + secondPart + "\"").collect(Collectors.joining("|")); + } + } + + if (hgvsP != null && !hgvsP.isEmpty()) { + if (!st.isEmpty()) { + /* + * we must have hgvs.c data - so add bar + */ + st += "|"; + } + st += "\"" + hgvsP.substring(hgvsP.indexOf('.') + 1) + "\""; + } + + if (!st.isEmpty()) { + return "\"GENE\"+(" + st + ")"; + } + return st; + } + + /** + * Splits the strings in the supplied list by tab, and flattens them to a single list + */ + public static List convertAnnotations(List manyAnnotations) { + if (null != manyAnnotations) { + return manyAnnotations.stream().flatMap(s -> java.util.Arrays.stream(TabTokenizer.tokenize(s))).collect(Collectors.toList()); + } + return Collections.emptyList(); + } + + /** + * get the requiredAnnotation value from the list of annotations + * returns an empty Optional if not present + * + * @param listOfAnnotations + * @param requiredAnnotation + * @return + */ + public static Optional getAnnotationFromList(List listOfAnnotations, String requiredAnnotation) { + + if (null != listOfAnnotations && !StringUtils.isNullOrEmpty(requiredAnnotation)) { + for (String anno : listOfAnnotations) { + if (anno.startsWith(requiredAnnotation)) { + return Optional.of(anno.substring(requiredAnnotation.length() + 1)); // don't forget the equals sign + } + } + } + return Optional.empty(); + } + + /** + * Retrieves the AD (allele depth) values for split VCF records. + * + * @param altArray An array of alternate alleles. + * @param gatkAD The GATK AD field containing comma-separated allele depth values. + * @return A map of alternate alleles to their corresponding AD values. 
+ */ + public static Map getADForSplitVcfRecords(String[] altArray, String gatkAD) { + + Map altToADMap = new HashMap<>(4); + String[] gatkADArray = gatkAD.split(","); + /* + * should have 1 more in the gatkADArray than the altArray + */ + if (altArray.length == gatkADArray.length - 1) { + for (int i = 0; i < altArray.length; i++) { + altToADMap.put(altArray[i], gatkADArray[0] + "," + gatkADArray[i + 1]); + } + } + + return altToADMap; + } + + public static List generateHeaders(AnnotationInputs ais, QExec exec) { + List headers = new ArrayList<>(); + if (null != exec) { + headers.add("##" + exec.getStartTime().toLogString()); + headers.add("##" + exec.getUuid().toLogString()); + headers.add("##" + exec.getHost().toLogString()); + headers.add("##" + exec.getRunBy().toLogString()); + headers.add("##" + exec.getJavaVersion().toLogString()); + headers.add("##" + exec.getToolName().toLogString()); + headers.add("##" + exec.getToolVersion().toLogString()); + headers.add("##" + exec.getCommandLine().toLogString()); + } + if (null != ais && null != ais.getInputs()) { + + for (AnnotationInput ai : ais.getInputs()) { + headers.add("##file:fields\t" + ai.getFile() + ":" + ai.getFields()); + } + + String emptyHeaders = ais.getAdditionalEmptyFields(); + String[] emptyHeadersArray = StringUtils.isNullOrEmpty(emptyHeaders) ? new String[]{} : emptyHeaders.split(","); + String fieldOrder = ais.getOutputFieldOrder(); + String[] fieldOrderArray = StringUtils.isNullOrEmpty(fieldOrder) ? new String[]{} : fieldOrder.split(","); + + String header = "#chr\tposition\tref\talt\toriginal_alt\tGATK_GT\tGATK_AD\t" + String.join("\t", fieldOrderArray); + if (emptyHeadersArray.length > 0) { + header += "\t" + String.join("\t", emptyHeadersArray); + } + + boolean includeSearchTerm = ais.isIncludeSearchTerm(); + header += (includeSearchTerm ? 
"\tsearchTerm" : ""); + headers.add(header); + } + + return headers; + } + + public static String generateAdditionalEmptyValues(AnnotationInputs ais) { + String emptyHeaders = ais.getAdditionalEmptyFields(); + + if (StringUtils.isNullOrEmpty(emptyHeaders)) { + return ""; + } else { + return getEmptyHeaderValues(org.apache.commons.lang3.StringUtils.countMatches(emptyHeaders, ",") + 1); + } + } } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java index cace848dd..64d3d00a5 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationInputs.java @@ -31,6 +31,8 @@ public int getAnnotationSourceThreadCount() { } +// public record AnnotationInput(String file, int chrIndex, int positionIndex, int refIndex, +// int altIndex, boolean snpEffVcf, String fields, boolean chrStartsWithChr) {} public static class AnnotationInput { private String file; private int chrIndex; @@ -39,34 +41,40 @@ public static class AnnotationInput { private int altIndex; private boolean snpEffVcf; private String fields; - + private boolean chrStartsWithChr; + + public AnnotationInput() {} + public String getFile() { return file; } - + public int getChrIndex() { return chrIndex; } - + public String getFields() { return fields; } - + public int getPositionIndex() { return positionIndex; } - + public int getRefIndex() { return refIndex; } - + public int getAltIndex() { return altIndex; } - + public boolean isSnpEffVcf() { return snpEffVcf; } - - } + + public boolean isChrStartsWithChr() { + return chrStartsWithChr; + } +} } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java index 39cdda7c2..79e5fd12d 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSource.java @@ 
-14,233 +14,338 @@ import org.qcmg.common.model.ChrPosition; import org.qcmg.common.model.ChrPositionComparator; import org.qcmg.common.model.ChrPositionRefAlt; +import org.qcmg.common.util.ChrPositionUtils; import org.qcmg.common.util.TabTokenizer; import org.qcmg.qio.record.RecordReader; public abstract class AnnotationSource implements Closeable { - - public static final String FIELD_DELIMITER_EQ = "="; - public static final char DEFAULT_DELIMITER = '\t'; - public static final String FIELD_DELIMITER_TAB = "\t"; - public static final Comparator COMP = ChrPositionComparator.getStringComparatorForHG38(); - - static final QLogger logger = QLoggerFactory.getLogger(AnnotationSource.class); - - List currentRecords; - List nextRecords; - ChrPosition currentCP; - ChrPosition nextCP; - - protected final RecordReader reader; - protected final Iterator iter; - protected final int chrPositionInRecord; - protected final int positionPositionInRecord; - protected final int refPositionInFile; - protected final int altPositionInFile; - - - public AnnotationSource(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile) { - super(); - this.reader = reader; - this.iter = reader.iterator(); - this.chrPositionInRecord = chrPositionInRecord - 1; - this.positionPositionInRecord = positionPositionInRecord - 1; - this.refPositionInFile = refPositionInFile - 1; - this.altPositionInFile = altPositionInFile - 1; - } - - - public abstract String annotationToReturn(String record); - - public static String getEmptyRecordReturnValue(String fieldNames) { - return Arrays.stream(fieldNames.split(",")).map(s -> s + "=").collect(Collectors.joining(FIELD_DELIMITER_TAB)); - } - - public String getAnnotation(ChrPosition requestedCp) { - - logger.debug(reader.getFile().getName() + ": requestedCp is " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null) + ", nextCP: " + (null != nextCP ? nextCP.toIGVString() : null)); - - - /* - * check to see if the records we currently have stored are a match - */ - if ( areCPsEqual(requestedCp, currentCP) == 0) { - - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturn(rec); - } - } - - } else if (null != nextCP && areCPsEqual(requestedCp, nextCP) < 0) { - /* - * requestedCp is "less than" next CP - * return empty list here - */ - } else { - logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null)); - getNextRecord(requestedCp); - if ( areCPsEqual(requestedCp, currentCP) == 0) { - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturn(rec); - } - } - } else { - /* - * requestedCP and currentCP are not equal - */ - } - } - return annotationToReturn(null); - } - - void getNextRecord(ChrPosition requestedCp) { - currentRecords = new ArrayList<>(); - - /* - * check to see if the nextCP meets our criteria - * if it does, set currentCP to be next CP - * we still need to iterate through the file to see if we have more than this entry - */ - int nextCPmatch = areCPsEqual(requestedCp, nextCP); - if (nextCPmatch == 0) { - currentCP = nextCP; - currentRecords = nextRecords; - } - - - while (iter.hasNext()) { - String nextRecord = iter.next(); - /* - * check to see if this record is the one we want - */ - String[] nextRecordArray = TabTokenizer.partialTokenize(nextRecord, DEFAULT_DELIMITER, Math.max(chrPositionInRecord, positionPositionInRecord) + 1); - - int match = isThisOurRecord(requestedCp, nextRecordArray, chrPositionInRecord, positionPositionInRecord); - - if (match == 0) { - /* - * got a match! 
- * we could have more than 1 entry for each position - */ - currentCP = getCpFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord); - currentRecords.add(nextRecord); - - } else if (match < 0) { - /* - * we have overshot - set nextCP and break out - */ - nextCP = getCpFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord); - nextRecords = new ArrayList<>(); - nextRecords.add(nextRecord); - break; - } else { - /* - * no match yet - keep going - */ - } - } - } - - public static ChrPosition getCpFromRecord(String[] rec, int chrPositionInRecord, int positionPositionInRecord) { - if (null == rec || rec.length == 0) { - throw new IllegalArgumentException("String array rec is null or empty"); - } - if (Math.max(chrPositionInRecord, positionPositionInRecord) >= rec.length) { - throw new IllegalArgumentException("String array rec is of length: " + rec.length + ", and Math.max(chrPositionInRecord, positionPositionInRecord): " + Math.max(chrPositionInRecord, positionPositionInRecord)); - } - return new ChrPointPosition(rec[chrPositionInRecord], Integer.parseInt(rec[positionPositionInRecord])); - } - - /* - * 1 based numbering - * much like a compare method, this will return 0 if the requestedCp is the same as the rec, - * 1 if the requestedCp is upstream of the rec - * -1 if the requestedCp is downstream of the rec - */ - public static int isThisOurRecord(ChrPosition requestedCp, String[] recArray, int chrPositionInRecord, int positionPositionInRecord) { - - return isThisOurRecord(requestedCp, recArray[chrPositionInRecord], Integer.parseInt(recArray[positionPositionInRecord])); - } - - public static int isThisOurRecord(ChrPosition requestedCp, String recordChr, int recordPosition) { - if (null == requestedCp) { - return 1; - } - return compareChromosomeNameAndStartPositions(requestedCp.getChromosome(), requestedCp.getStartPosition(), recordChr, recordPosition); - } - - /** - * - * Compares chromosome names and positions - */ - public 
static int compareChromosomeNameAndStartPositions(String chr1, int position1, String chr2, int position2) { - if (null == chr1) { - return 1; - } - if (null == chr2) { - return -1; - } - boolean chr1StartsWithChr = chr1.startsWith("chr"); - boolean chr2StartsWithChr = chr2.startsWith("chr"); - int diff = COMP.compare((chr1StartsWithChr ? chr1.substring(3) : chr1), (chr2StartsWithChr ? chr2.substring(3) : chr2)); - if (diff != 0) { - return diff; - } - /* - * check position now - */ - return Integer.compare(position1, position2); - } - - - /** - * - * THis is effectively comparing the 2 supplied ChrPositions. - * If the first cp is null, 1 is returned. - * If the second cp is null, -1 is returned - * - * NOTE that if both cps supplied are null, 1 is returned (due to the first cp being null)! - * - * If they are both non-null, the contig names and start and end positions are compared - * - * - * @param cp1 - * @param cp2 - * @return - */ - public static int areCPsEqual(ChrPosition cp1 ,ChrPosition cp2) { - if (null == cp1) { - return 1; - } - if (null == cp2) { - return -1; - } - - int nameAndStartPositionMatch = compareChromosomeNameAndStartPositions(cp1.getChromosome(), cp1.getStartPosition(), cp2.getChromosome(), cp2.getStartPosition()); - if (nameAndStartPositionMatch != 0) { - return nameAndStartPositionMatch; - } - - return Integer.compare(cp1.getEndPosition(), cp2.getEndPosition()); - } + + public static final String FIELD_DELIMITER_EQ = "="; + public static final char DEFAULT_DELIMITER = '\t'; + public static final String FIELD_DELIMITER_TAB = "\t"; + public static final Comparator COMP = ChrPositionComparator.getChrNameComparatorNoChrsOneToM(); + + static final QLogger logger = QLoggerFactory.getLogger(AnnotationSource.class); + + List currentRecords; + List nextRecords; + long currentCPAsLong; + long nextCPAsLong; + + + protected final RecordReader reader; + protected final Iterator iter; + protected final int chrPositionInRecord; + protected final int 
positionPositionInRecord; + protected final int refPositionInFile; + protected final int altPositionInFile; + protected final boolean canUseStartsWith; + + protected final boolean chrStartsWithChr; + + + public AnnotationSource(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, int refPositionInFile, int altPositionInFile, boolean chrStartsWithChr) { + super(); + this.reader = reader; + this.iter = reader.iterator(); + this.chrPositionInRecord = chrPositionInRecord - 1; + this.positionPositionInRecord = positionPositionInRecord - 1; + this.refPositionInFile = refPositionInFile - 1; + this.altPositionInFile = altPositionInFile - 1; + this.canUseStartsWith = this.chrPositionInRecord == 0 && this.positionPositionInRecord == 1; + this.chrStartsWithChr = chrStartsWithChr; + } + + + public abstract String annotationToReturn(String[] record); + + public static String getEmptyRecordReturnValue(String fieldNames) { + return Arrays.stream(fieldNames.split(",")).map(s -> s + FIELD_DELIMITER_EQ).collect(Collectors.joining(FIELD_DELIMITER_TAB)); + } + + public String getAnnotation(long requestedCpAsLong, ChrPosition requestedCp) { + + logger.debug(reader.getFile().getName() + ": requestedCp is " + requestedCpAsLong + ", currentCP: " + currentCPAsLong + ", nextCP: " + nextCPAsLong); + + + /* + * check to see if the records we currently have stored are a match + */ + if (requestedCpAsLong == currentCPAsLong) { + + /* + * we match on position + * lets see if there are any records that match on ref and alt + */ + return getAnnotationsFromCurrentRecords(requestedCp); + + } else { + int matchWithNextCP = Long.compare(requestedCpAsLong, nextCPAsLong); + if (nextCPAsLong > -1 && matchWithNextCP < 0) { + /* + * requestedCp is "less than" next CP + * return empty list here + */ + } else { +// logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? 
requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? currentCP.toIGVString() : null)); + getNextRecord(requestedCpAsLong, matchWithNextCP); + if (requestedCpAsLong == currentCPAsLong) { + return getAnnotationsFromCurrentRecords(requestedCp); + } + /* + * requestedCP and currentCP are not equal + */ + } + } + return annotationToReturn(null); + } + + private String getAnnotationsFromCurrentRecords(ChrPosition requestedCp) { + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturn(recArray); + } + } + } + return annotationToReturn(null); + } + + void getNextRecord(long requestedCpAsLong, int matchWithNextCP) { +// void getNextRecord(ChrPosition requestedCp, int matchWithNextCP) { + currentRecords = new ArrayList<>(4); + + /* + * check to see if the nextCP meets our criteria + * if it does, set currentCP to be next CP + * we still need to iterate through the file to see if we have more than this entry + */ + if (matchWithNextCP == 0) { + currentCPAsLong = nextCPAsLong; + currentRecords = nextRecords; + } + +// String startsWithString = (chrStartsWithChr ? 
requestedCp.getChromosome() : requestedCp.getChromosome().substring(3)); + while (iter.hasNext()) { + /* + * check to see if this record is the one we want + */ + String nextRecord = iter.next(); + int match; + String[] nextRecordArray = null; + if (canUseStartsWith) { + match = isThisOurRecordShortcut(requestedCpAsLong, nextRecord, chrStartsWithChr); + } else { + nextRecordArray = TabTokenizer.partialTokenize(nextRecord, DEFAULT_DELIMITER, Math.max(chrPositionInRecord, positionPositionInRecord) + 1); + match = isThisOurRecord(requestedCpAsLong, nextRecordArray, chrPositionInRecord, positionPositionInRecord, chrStartsWithChr); + } + if (match == 0) { + /* + * got a match! + * we could have more than 1 entry for each position + */ + currentCPAsLong = getChrPositionAsLongFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord, nextRecord, chrStartsWithChr); + currentRecords.add(nextRecord); + + } else if (match < 0) { + /* + * we have overshot - set nextCP and break out + */ + nextCPAsLong = getChrPositionAsLongFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord, nextRecord, chrStartsWithChr); + nextRecords = new ArrayList<>(); + nextRecords.add(nextRecord); + break; + } + /* + * no match yet - keep going + */ + } + } + + public static ChrPosition getChrPositionFromRecord(String[] nextRecordArray, int chrPositionInRecord, int positionPositionInRecord, String nextRecord) { + if (null == nextRecordArray) { + int firstTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER); + int secondTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER, firstTabIndex + 1); + return new ChrPointPosition(nextRecord.substring(0, firstTabIndex), Integer.parseInt(nextRecord.substring(firstTabIndex + 1, secondTabIndex))); + } else { + return getCpFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord); + } + } + + public static long getChrPositionAsLongFromRecord(String[] nextRecordArray, int chrPositionInRecord, int positionPositionInRecord, 
String nextRecord, boolean chrStartsWithChr) { + if (null == nextRecordArray) { + int firstTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER); + int secondTabIndex = nextRecord.indexOf(DEFAULT_DELIMITER, firstTabIndex + 1); + String contig = chrStartsWithChr ? nextRecord.substring(3, firstTabIndex) : nextRecord.substring(0, firstTabIndex); + int position = Integer.parseInt(nextRecord, firstTabIndex + 1, secondTabIndex, 10); + return ChrPositionUtils.convertContigAndPositionToLong(contig, position); + } else { + return getCpAsLongFromRecord(nextRecordArray, chrPositionInRecord, positionPositionInRecord, chrStartsWithChr); + } + } + + /** + * Compares the requested chromosome position (cp) with the cp of the given record line, much like a compare method. + * The chromosome is compared first; the record's position is only parsed and compared when the chromosomes are the same. + * The requested cp is represented as a long value, where the upper 32 bits represent the chromosome and the lower 32 bits represent the position. + * If the record's cp matches the requested cp, 0 is returned. + * If the requested cp is before the record's cp a negative value is returned, and if it is after the record's cp a positive value is returned. + * If the requested cp is -1, 1 is returned without examining the record. + * + * @param requestedCpAsLong The requested chromosome position as a long value. + * @param recordLine The record line to compare - chromosome and position must be its first two tab-delimited columns. + * @param chrStartsWithChr Indicates if the chromosome name in the record starts with "chr". + * @return 0 if the record is at the requested cp, a negative value if the requested cp is before the record, a positive value if the requested cp is after the record (or is -1). + */ + public static int isThisOurRecordShortcut(long requestedCpAsLong, String recordLine, boolean chrStartsWithChr) { + if (requestedCpAsLong == -1) { + return 1; + } + int firstTabIndex = recordLine.indexOf(DEFAULT_DELIMITER); + int recordChrInt = ChrPositionUtils.convertContigNameToInt(recordLine.substring(chrStartsWithChr ? 
3 : 0, firstTabIndex)); + + if (recordChrInt == requestedCpAsLong >>> 32) { + // same chromosome, examine the position + int position = Integer.parseInt(recordLine, firstTabIndex + 1, recordLine.indexOf(DEFAULT_DELIMITER, firstTabIndex + 1), 10); + return Integer.compare((int) (requestedCpAsLong & 0x00000000FFFFFFFFL), position); + } else { + // examine the chromosome only + return Integer.compare((int) (requestedCpAsLong >>> 32), recordChrInt); + } + } + + public static ChrPosition getCpFromRecord(String[] rec, int chrPositionInRecord, int positionPositionInRecord) { + if (null == rec || rec.length == 0) { + throw new IllegalArgumentException("String array rec is null or empty"); + } + if (Math.max(chrPositionInRecord, positionPositionInRecord) >= rec.length) { + throw new IllegalArgumentException("String array rec is of length: " + rec.length + ", and Math.max(chrPositionInRecord, positionPositionInRecord): " + Math.max(chrPositionInRecord, positionPositionInRecord)); + } + return new ChrPointPosition(rec[chrPositionInRecord], Integer.parseInt(rec[positionPositionInRecord])); + } + + public static long getCpAsLongFromRecord(String[] rec, int chrPositionInRecord, int positionPositionInRecord, boolean chrStartsWithChr) { + if (null == rec || rec.length == 0) { + throw new IllegalArgumentException("String array rec is null or empty"); + } + if (Math.max(chrPositionInRecord, positionPositionInRecord) >= rec.length) { + throw new IllegalArgumentException("String array rec is of length: " + rec.length + ", and Math.max(chrPositionInRecord, positionPositionInRecord): " + Math.max(chrPositionInRecord, positionPositionInRecord)); + } + + String recordChr = chrStartsWithChr ? 
rec[chrPositionInRecord].substring(3) : rec[chrPositionInRecord]; + int position = Integer.parseInt(rec[positionPositionInRecord]); + return ChrPositionUtils.convertContigAndPositionToLong(recordChr, position); + } + + /* + * 1 based numbering + * much like a compare method, this will return 0 if the requestedCp is the same as the rec, + * 1 if the requestedCp is upstream of the rec + * -1 if the requestedCp is downstream of the rec + */ + public static int isThisOurRecord(long requestedCpAsLong, String[] recArray, int chrPositionInRecord, int positionPositionInRecord, boolean chrStartsWithChr) { + + return isThisOurRecord(requestedCpAsLong, recArray[chrPositionInRecord], Integer.parseInt(recArray[positionPositionInRecord]), chrStartsWithChr); + } + + public static int isThisOurRecord(long requestedCpAsLong, String recordChr, int recordPosition, boolean chrStartsWithChr) { + if (requestedCpAsLong == -1) { + return 1; + } + long recordAsLong = ChrPositionUtils.convertContigAndPositionToLong(chrStartsWithChr ? recordChr.substring(3) : recordChr, recordPosition); + return Long.compare(requestedCpAsLong, recordAsLong); + } + + /** + * Compares the chromosome name and start positions of two variants. + * + * @param chr1 The chromosome name of the first variant. + * @param position1 The start position of the first variant. + * @param chr2 The chromosome name of the second variant. + * @param position2 The start position of the second variant. + * @return 1 if the first variant is greater, -1 if the second variant is greater, 0 if they are equal. + */ + public static int compareChromosomeNameAndStartPositions(String chr1, int position1, String chr2, int position2) { + if (null == chr1) { + return 1; + } + if (null == chr2) { + return -1; + } + boolean chr1StartsWithChr = chr1.startsWith("chr"); + boolean chr2StartsWithChr = chr2.startsWith("chr"); + int diff = COMP.compare((chr1StartsWithChr ? chr1.substring(3) : chr1), (chr2StartsWithChr ? 
chr2.substring(3) : chr2)); + if (diff != 0) { + return diff; + } + /* + * check position now + */ + return Integer.compare(position1, position2); + } + + + /** + * Compares two ChrPosition objects and determines if they are equal. + * + * @param cp1 The first ChrPosition object to compare. + * @param cp2 The second ChrPosition object to compare. + * @return 0 if the two ChrPosition objects are equal, 1 if cp1 is null, -1 if cp2 is null (and cp1 is not), otherwise a positive value if cp1 is greater than cp2 and a negative value if cp1 is smaller than cp2. + */ + public static int compareCPs(ChrPosition cp1, ChrPosition cp2) { + if (null == cp1) { + return 1; + } + if (null == cp2) { + return -1; + } + + if (areChrPointPositionsEqual(cp1, cp2, true)) { + return 0; + } + + int nameAndStartPositionMatch = compareChromosomeNameAndStartPositions(cp1.getChromosome(), cp1.getStartPosition(), cp2.getChromosome(), cp2.getStartPosition()); + if (nameAndStartPositionMatch != 0) { + return nameAndStartPositionMatch; + } + + return Integer.compare(cp1.getEndPosition(), cp2.getEndPosition()); + } + + /** + * Compares two ChrPosition objects and determines if their chromosome and start positions are equal. + * + * @param cp1 The first ChrPosition object to compare. + * @param cp2 The second ChrPosition object to compare. + * @param ignoreChromosome If true, removes 'chr' from chromosome name (if present) when comparing. + * @return True if the chromosome and start positions are equal, otherwise false. 
+ */ + public static boolean areChrPointPositionsEqual(ChrPosition cp1, ChrPosition cp2, boolean ignoreChromosome) { + if (cp1 == null || cp2 == null) { + return false; + } + if (cp1.getStartPosition() == cp2.getStartPosition()) { + if (cp1.getChromosome().equals(cp2.getChromosome())) { + return true; + } + if (ignoreChromosome) { + boolean cp1StartsWithChr = cp1.getChromosome().startsWith("chr"); + boolean cp2StartsWithChr = cp2.getChromosome().startsWith("chr"); + if ((cp1StartsWithChr && cp2StartsWithChr) || (!cp1StartsWithChr && !cp2StartsWithChr)) { + return false; + } else { + return (cp1StartsWithChr ? cp1.getChromosome().substring(3) : cp1.getChromosome()).equals((cp2StartsWithChr ? cp2.getChromosome().substring(3) : cp2.getChromosome())); + } + } + } + return false; + } } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java index ae181370b..005a0875f 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceSnpEffVCF.java @@ -15,258 +15,294 @@ import org.qcmg.qio.record.RecordReader; public class AnnotationSourceSnpEffVCF extends AnnotationSource { - - public static final String FIELD_DELIMITER_SEMI_COLON = ";"; - - public static final Map SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS = Stream.of( - new SimpleEntry<>("alt", 0), - new SimpleEntry<>("annotation", 1), - new SimpleEntry<>("effect", 1), // annotation is also known as effect - new SimpleEntry<>("putative_impact", 2), - new SimpleEntry<>("gene_name", 3), - new SimpleEntry<>("gene_id", 4), - new SimpleEntry<>("feature_type", 5), - new SimpleEntry<>("feature_id", 6), - new SimpleEntry<>("transcript_biotype", 7), - new SimpleEntry<>("rank", 8), - new SimpleEntry<>("hgvs.c", 9), - new SimpleEntry<>("hgvs.p", 10), - new SimpleEntry<>("cdna_position", 11), - new SimpleEntry<>("cds_position", 12), - new 
SimpleEntry<>("protein_position", 13), - new SimpleEntry<>("distance_to_feature", 14), - new SimpleEntry<>("errors", 15), - new SimpleEntry<>("warnings", 15), - new SimpleEntry<>("information", 15)).collect(Collectors.toMap(SimpleEntry::getKey, SimpleEntry::getValue)); - - - - List annotationFields; - String emptyRecordResult; - - public AnnotationSourceSnpEffVCF(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile, String fieldNames) { - super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile); - // TODO Auto-generated constructor stub - - if (StringUtils.isNullOrEmpty(fieldNames)) { - throw new IllegalArgumentException("Null or empty fieldNames parameter passed to AnnotationSourceVCF ctor"); - } - /* - * should check to ensure the header contains the request field names - */ - - annotationFields = Arrays.stream(fieldNames.split(",")).collect(Collectors.toList()); - emptyRecordResult = getEmptyRecordReturnValue(fieldNames); - } - - @Override - public String getAnnotation(ChrPosition requestedCp) { - - logger.debug(reader.getFile().getName() + ": requestedCp is " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? currentCP.toIGVString() : null) + ", nextCP: " + (null != nextCP ? 
nextCP.toIGVString() : null)); - - /* - * check to see if the records we currently have stored are a match - */ - if ( areCPsEqual(requestedCp, currentCP) == 0) { - - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (recAlt.contains(",")) { - String [] recAltArray = recAlt.split(","); - for (String recAltValue : recAltArray) { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAltValue)) { - return annotationToReturnWithAlt(rec, recAltValue); - } - } - } else { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturnWithAlt(rec, recAlt); - } - } - } - - } else if (null != nextCP && areCPsEqual(requestedCp, nextCP) < 0) { - /* - * requestedCp is "less than" next CP - * return empty list here - */ - } else { - logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null)); - getNextRecord(requestedCp); - if ( areCPsEqual(requestedCp, currentCP) == 0) { - /* - * we match on position - * lets see if there are any records that match on ref and alt - */ - for (String rec : currentRecords) { - String [] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); - String recRef = recArray[refPositionInFile]; - String recAlt = recArray[altPositionInFile]; - - if (recAlt.contains(",")) { - String [] recAltArray = recAlt.split(","); - for (String recAltValue : recAltArray) { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAltValue)) { - return annotationToReturnWithAlt(rec, recAltValue); - } - } - } else { - if (((ChrPositionRefAlt)requestedCp).getRef().equals(recRef) && ((ChrPositionRefAlt)requestedCp).getAlt().equals(recAlt)) { - return annotationToReturnWithAlt(rec, recAlt); - } - } - } - } else { - /* - * requestedCP and currentCP are not equal - */ - } - } - return annotationToReturn(null); - } - - @Override - public String annotationToReturn(String record) { - if (null == record) { - return emptyRecordResult; - } - /* - * dealing with a vcf file and assuming that the required annotation fields are in the INFO field - * so get that and go from there. - */ - String [] recordArray = record.split("\t"); - String info = recordArray[7]; - String alt = recordArray[4]; - - /* - * entries in the INFO field are delimited by ';' - */ - logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); - return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); - } - public String annotationToReturnWithAlt(String record, String alt) { - if (null == record) { - return emptyRecordResult; - } - /* - * dealing with a vcf file and assuming that the required annotation fields are in the INFO field - * so get that and go from there. 
- */ - String [] recordArray = record.split("\t"); - String info = recordArray[7]; - - /* - * entries in the INFO field are delimited by ';' - */ - logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); - return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); - } - - - public static String extractFieldsFromInfoField(String info, List fields, String emptyInfoFieldResult, String alt) { - if (StringUtils.isNullOrEmptyOrMissingData(info)) { - return emptyInfoFieldResult; - } - - String dataToReturn = ""; - String worstConsequence = getWorstConsequence(info, alt); - - /* - * if we didn't have a match on alt, return the empty result - */ - if (StringUtils.isNullOrEmpty(worstConsequence)) { - return emptyInfoFieldResult; - } - - /* - * we have our consequence - * split by pipe and then get our fields - */ - String [] consequenceArray = TabTokenizer.tokenize(worstConsequence, '|'); - - for (String af : fields) { - if ( ! StringUtils.isNullOrEmpty(af)) { - - /* - * get position from map - */ - String aflc = af.toLowerCase(); - Integer arrayPosition = SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS.get(aflc); - if (null != arrayPosition && arrayPosition >= 0 && arrayPosition < consequenceArray.length) { - /* - * good - */ - String annotation = consequenceArray[arrayPosition]; - dataToReturn += ! dataToReturn.isEmpty() ? 
FIELD_DELIMITER_TAB + af + "=" + annotation : af + "=" + annotation; - } else { + + public static final String FIELD_DELIMITER_SEMI_COLON = ";"; + + public static final Map SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS = Stream.of( + new SimpleEntry<>("alt", 0), + new SimpleEntry<>("annotation", 1), + new SimpleEntry<>("effect", 1), // annotation is also known as effect + new SimpleEntry<>("putative_impact", 2), + new SimpleEntry<>("gene_name", 3), + new SimpleEntry<>("gene_id", 4), + new SimpleEntry<>("feature_type", 5), + new SimpleEntry<>("feature_id", 6), + new SimpleEntry<>("transcript_biotype", 7), + new SimpleEntry<>("rank", 8), + new SimpleEntry<>("hgvs.c", 9), + new SimpleEntry<>("hgvs.p", 10), + new SimpleEntry<>("cdna_position", 11), + new SimpleEntry<>("cds_position", 12), + new SimpleEntry<>("protein_position", 13), + new SimpleEntry<>("distance_to_feature", 14), + new SimpleEntry<>("errors", 15), + new SimpleEntry<>("warnings", 15), + new SimpleEntry<>("information", 15)).collect(Collectors.toMap(SimpleEntry::getKey, SimpleEntry::getValue)); + + + List annotationFields; + String emptyRecordResult; + + public AnnotationSourceSnpEffVCF(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, + int refPositionInFile, int altPositionInFile, String fieldNames, boolean chrStartsWithChr) { + super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile, chrStartsWithChr); + // TODO Auto-generated constructor stub + + if (StringUtils.isNullOrEmpty(fieldNames)) { + throw new IllegalArgumentException("Null or empty fieldNames parameter passed to AnnotationSourceVCF ctor"); + } + /* + * should check to ensure the header contains the request field names + */ + + annotationFields = Arrays.stream(fieldNames.split(",")).collect(Collectors.toList()); + emptyRecordResult = getEmptyRecordReturnValue(fieldNames); + } + + @Override + public String getAnnotation(long requestedCpAsLong, ChrPosition requestedCp) { + +// 
logger.debug(reader.getFile().getName() + ": requestedCp is " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? currentCP.toIGVString() : null) + ", nextCP: " + (null != nextCP ? nextCP.toIGVString() : null)); + + /* + * check to see if the records we currently have stored are a match + */ + if (requestedCpAsLong == currentCPAsLong) { + + /* + * we match on position + * lets see if there are any records that match on ref and alt + */ +// return getAnnotationsFromRecords(requestedCp); + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (recAlt.contains(",")) { + String[] recAltArray = recAlt.split(","); + for (String recAltValue : recAltArray) { + if (reqRef.equals(recRef) && reqAlt.equals(recAltValue)) { + return annotationToReturnWithAlt(rec, recAltValue); + } + } + } else { + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturnWithAlt(rec, recAlt); + } + } + } + } + + } else { + int matchWithNextCP = Long.compare(requestedCpAsLong, nextCPAsLong); + if (nextCPAsLong > -1 && matchWithNextCP < 0) { + + } else { + +// logger.debug(reader.getFile().getName() + ": getting next record. requestedCp: " + (null != requestedCp ? requestedCp.toIGVString() : null) + ", currentCP: " + (null != currentCP ? 
currentCP.toIGVString() : null)); + getNextRecord(requestedCpAsLong, matchWithNextCP); + if (requestedCpAsLong == currentCPAsLong) { + /* + * we match on position + * lets see if there are any records that match on ref and alt + */ + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (recAlt.contains(",")) { + String[] recAltArray = recAlt.split(","); + for (String recAltValue : recAltArray) { + if (reqRef.equals(recRef) && reqAlt.equals(recAltValue)) { + return annotationToReturnWithAlt(rec, recAltValue); + } + } + } else { + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturnWithAlt(rec, recAlt); + } + } + } + } +// return getAnnotationsFromRecords(requestedCp); + } + /* + * requestedCP and currentCP are not equal + */ + } + } + return annotationToReturn(null); + } + + private String getAnnotationsFromRecords(ChrPosition requestedCp){ + if (requestedCp instanceof ChrPositionRefAlt reqCpRefAlt) { + String reqRef = reqCpRefAlt.getRef(); + String reqAlt = reqCpRefAlt.getAlt(); + for (String rec : currentRecords) { + String[] recArray = TabTokenizer.tokenize(rec, DEFAULT_DELIMITER); + String recRef = recArray[refPositionInFile]; + String recAlt = recArray[altPositionInFile]; + + if (recAlt.contains(",")) { + String[] recAltArray = recAlt.split(","); + for (String recAltValue : recAltArray) { + if (reqRef.equals(recRef) && reqAlt.equals(recAltValue)) { + return annotationToReturnWithAlt(rec, recAltValue); + } + } + } else { + if (reqRef.equals(recRef) && reqAlt.equals(recAlt)) { + return annotationToReturnWithAlt(rec, recAlt); + } + } + } + } + return annotationToReturn(null); + } + + @Override + public String annotationToReturn(String[] record) { 
+ if (null == record) { + return emptyRecordResult; + } + /* + * dealing with a vcf file and assuming that the required annotation fields are in the INFO field + * so get that and go from there. + */ +// String[] recordArray = record.split("\t"); + String info = record[7]; + String alt = record[4]; + + /* + * entries in the INFO field are delimited by ';' + */ + logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); + return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); + } + + public String annotationToReturnWithAlt(String record, String alt) { + if (null == record) { + return emptyRecordResult; + } + /* + * dealing with a vcf file and assuming that the required annotation fields are in the INFO field + * so get that and go from there. + */ + String[] recordArray = record.split("\t"); + String info = recordArray[7]; + + /* + * entries in the INFO field are delimited by ';' + */ + logger.debug("looking for annotations in info field: " + info + ", with alt: " + alt); + return extractFieldsFromInfoField(info, annotationFields, emptyRecordResult, alt); + } + + + public static String extractFieldsFromInfoField(String info, List fields, String emptyInfoFieldResult, String alt) { + if (StringUtils.isNullOrEmptyOrMissingData(info)) { + return emptyInfoFieldResult; + } + + StringBuilder dataToReturn = new StringBuilder(); + String worstConsequence = getWorstConsequence(info, alt); + + /* + * if we didn't have a match on alt, return the empty result + */ + if (StringUtils.isNullOrEmpty(worstConsequence)) { + return emptyInfoFieldResult; + } + + /* + * we have our consequence + * split by pipe and then get our fields + */ + String[] consequenceArray = TabTokenizer.tokenize(worstConsequence, '|'); + + for (String af : fields) { + if (!StringUtils.isNullOrEmpty(af)) { + + /* + * get position from map + */ + String aflc = af.toLowerCase(); + Integer arrayPosition = SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS.get(aflc); + 
if (null != arrayPosition && arrayPosition >= 0 && arrayPosition < consequenceArray.length) { + /* + * good + */ + String annotation = consequenceArray[arrayPosition]; + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + af + "=" + annotation : af + "=" + annotation); + } else { // System.out.println("Could not find field [" + af + "] in SNP_EFF_ANNOTATION_FIELDS_AND_POSITIONS map!"); // System.out.println("arrayPosition.intValue(): " + arrayPosition.intValue() + ", consequenceArray.length: " + consequenceArray.length); - } - - } - } - return dataToReturn.isEmpty() ? emptyInfoFieldResult : dataToReturn; - } - - /** - * @param info - * @param alt - * @return - */ - public static String getWorstConsequence(String info, String alt) { - /* - * SnpEff annotations are in the following format: - * ANN=|||||||,|||||||,|||||||| - * ie. a comma separated (ordered) list of consequences, which in turn are pipe delimited and contain the following columns: - * alt|effect|Putative_impact| - * - * - * - * snpEff sorts consequences as follows: - * Effect sort order. When multiple effects are reported, SnpEff sorts the effects the following way: - - * Putative impact: Effects having higher putative impact are first. - * Effect type: Effects assumed to be more deleterious effects first. - * Canonical transcript before non-canonical. - * Marker genomic coordinates (e.g. genes starting before first). - * - * - */ - - /* - * first get the consequence corresponding to this alt - * There will most likely be more than 1 - * Pick the first one as that is the one with the highest effect as decreed by snpEff - */ - int annoIndex = info.indexOf("ANN="); - int end = info.indexOf(FIELD_DELIMITER_SEMI_COLON, annoIndex); - String ann = info.substring(annoIndex + 4, end == -1 ? 
info.length() : end); - - - String [] annArray = ann.split(","); - String worstConsequence = ""; - for (String aa : annArray) { - if (aa.startsWith(alt)) { - worstConsequence = aa; - break; - } - } - return worstConsequence; - } - - @Override - public void close() throws IOException { - if (null != reader) { - reader.close(); - } - } - + } + + } + } + return (dataToReturn.isEmpty()) ? emptyInfoFieldResult : dataToReturn.toString(); + } + + /** + * @param info + * @param alt + * @return + */ + public static String getWorstConsequence(String info, String alt) { + /* + * SnpEff annotations are in the following format: + * ANN=|||||||,|||||||,|||||||| + * ie. a comma separated (ordered) list of consequences, which in turn are pipe delimited and contain the following columns: + * alt|effect|Putative_impact| + * + * + * + * snpEff sorts consequences as follows: + * Effect sort order. When multiple effects are reported, SnpEff sorts the effects the following way: + + * Putative impact: Effects having higher putative impact are first. + * Effect type: Effects assumed to be more deleterious effects first. + * Canonical transcript before non-canonical. + * Marker genomic coordinates (e.g. genes starting before first). + * + * + */ + + /* + * first get the consequence corresponding to this alt + * There will most likely be more than 1 + * Pick the first one as that is the one with the highest effect as decreed by snpEff + */ + int annoIndex = info.indexOf("ANN="); + int end = info.indexOf(FIELD_DELIMITER_SEMI_COLON, annoIndex); + String ann = info.substring(annoIndex + 4, end == -1 ? 
info.length() : end); + + + String[] annArray = ann.split(","); + String worstConsequence = ""; + for (String aa : annArray) { + if (aa.startsWith(alt)) { + worstConsequence = aa; + break; + } + } + return worstConsequence; + } + + @Override + public void close() throws IOException { + if (null != reader) { + reader.close(); + } + } + } diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java index b08036e81..3c050499c 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceTSV.java @@ -20,8 +20,8 @@ public class AnnotationSourceTSV extends AnnotationSource { Map headerNameAndPosition; public AnnotationSourceTSV(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile, String fieldNames) { - super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile); + int refPositionInFile, int altPositionInFile, String fieldNames, boolean chrStartsWithChr) { + super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile, chrStartsWithChr); // TODO Auto-generated constructor stub if (StringUtils.isNullOrEmpty(fieldNames)) { @@ -58,12 +58,12 @@ public static String getLastHeaderLine(List headerLines) { /* * easy */ - header = headerLines.get(0); + header = headerLines.getFirst(); } else if (headerLines.size() > 1) { /* * going to assume that the last line contains the header line */ - header = headerLines.get(headerLines.size() - 1); + header = headerLines.getLast(); } return header; } @@ -88,8 +88,8 @@ public static Map getHeaderNameAndPositions(String fieldNames, } @Override - public String annotationToReturn(String record) { - if (null == record) { + public String annotationToReturn(String[] record) { + if (null == record || record.length == 0) { return 
emptyRecordResult; } @@ -99,20 +99,21 @@ public String annotationToReturn(String record) { return extractFieldsFromRecord(record, headerNameAndPosition); } - public static String extractFieldsFromRecord(String record, Map fields) { - String dataToReturn = ""; - if ( ! StringUtils.isNullOrEmpty(record) && null != fields) { - String [] recordArray = TabTokenizer.tokenize(record); + public static String extractFieldsFromRecord(String[] record, Map fields) { + StringBuilder dataToReturn = new StringBuilder(); + int recordLength = null != record ? record.length : 0; + if ( recordLength > 0 && null != fields) { +// String [] recordArray = TabTokenizer.tokenize(record); for (Entry entry : fields.entrySet()) { /* * make sure that array length is not shorter than entry value */ - if (recordArray.length > entry.getValue().intValue()) { - dataToReturn += (dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB : "") + entry.getKey() + "=" + recordArray[entry.getValue().intValue()]; + if (recordLength > entry.getValue()) { + dataToReturn.append(( ! dataToReturn.isEmpty()) ? 
FIELD_DELIMITER_TAB : "").append(entry.getKey()).append("=").append(record[entry.getValue()]); } } } - return dataToReturn; + return dataToReturn.toString(); } @Override diff --git a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java index af0cc652c..dc15768ab 100644 --- a/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java +++ b/qannotate/src/au/edu/qimr/qannotate/nanno/AnnotationSourceVCF.java @@ -18,8 +18,8 @@ public class AnnotationSourceVCF extends AnnotationSource { String emptyRecordResult; public AnnotationSourceVCF(RecordReader reader, int chrPositionInRecord, int positionPositionInRecord, - int refPositionInFile, int altPositionInFile, String fieldNames) { - super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile); + int refPositionInFile, int altPositionInFile, String fieldNames, boolean chrStartsWithChr) { + super(reader, chrPositionInRecord, positionPositionInRecord, refPositionInFile, altPositionInFile, chrStartsWithChr); // TODO Auto-generated constructor stub if (StringUtils.isNullOrEmpty(fieldNames)) { @@ -34,19 +34,15 @@ public AnnotationSourceVCF(RecordReader reader, int chrPositionInRecord, } @Override - public String annotationToReturn(String record) { - if (null == record) { + public String annotationToReturn(String [] record) { + if (null == record || record.length == 0) { return emptyRecordResult; } /* * dealing with a vcf file and assuming that the required annotation fields are in the INFO field * so get that and go from there. 
*/ - String [] recordArray = record.split("\t"); -// if (recordArray.length <= 8) { -// System.out.println("vcf length <= 8: " + record); -// } - String info = recordArray[7]; + String info = record[7]; /* * entries in the INFO field are delimited by ';' @@ -59,23 +55,23 @@ public static String extractFieldsFromInfoField(String info, List fields if (StringUtils.isNullOrEmptyOrMissingData(info)) { return emptyInfoFieldResult; } - String dataToReturn = ""; + StringBuilder dataToReturn = new StringBuilder(); for (String af : fields) { if ( ! StringUtils.isNullOrEmpty(af)) { int start = info.indexOf(af + "="); if (start > -1) { int end = info.indexOf(FIELD_DELIMITER_SEMI_COLON, start); if (end == -1) { - dataToReturn += dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB + info.substring(start) : info.substring(start); + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + info.substring(start) : info.substring(start)); } else { - dataToReturn += dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB + info.substring(start, end) : info.substring(start, end); + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + info.substring(start, end) : info.substring(start, end)); } } else { - dataToReturn += dataToReturn.length() > 0 ? FIELD_DELIMITER_TAB + af + "=" : af + "="; + dataToReturn.append((!dataToReturn.isEmpty()) ? FIELD_DELIMITER_TAB + af + "=" : af + "="); } } } - return dataToReturn.length() == 0 ? emptyInfoFieldResult : dataToReturn; + return (dataToReturn.isEmpty()) ? 
emptyInfoFieldResult : dataToReturn.toString(); } @Override diff --git a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java index d422d5d1a..c8334a4f7 100644 --- a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java +++ b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotateTest.java @@ -1,8 +1,5 @@ package au.edu.qimr.qannotate.nanno; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.fail; - import java.io.BufferedWriter; import java.io.File; import java.io.FileWriter; @@ -19,8 +16,11 @@ import org.junit.rules.TemporaryFolder; import org.qcmg.common.commandline.Executor; import org.qcmg.common.model.ChrPositionRefAlt; +import org.qcmg.common.util.ChrPositionUtils; import org.qcmg.common.vcf.header.VcfHeaderUtils; +import static org.junit.Assert.*; + public class AnnotateTest { @@ -31,17 +31,18 @@ public class AnnotateTest { public void jsonInputs() throws IOException { File inputJson = testFolder.newFile("inputs.json"); File annotationSource = testFolder.newFile("annotation.vcf"); - createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4); + createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4, true); AnnotationInputs ais = AnnotateUtils.getInputs(inputJson.getAbsolutePath()); - assertEquals(true, ais != null); - assertEquals(1, ais.getInputs().size()); + assertTrue(ais != null); + assert ais != null; + assertEquals(1, ais.getInputs().size()); List sources = new ArrayList<>(); AnnotateUtils.populateAnnotationSources(ais, sources); assertEquals(1, sources.size()); - String annotation = sources.get(0).getAnnotation(new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); + String annotation = sources.getFirst().getAnnotation(ChrPositionUtils.convertContigAndPositionToLong("1", 95813205), new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); assertEquals("blah=", annotation); } @@ -49,7 +50,7 @@ public void jsonInputs() 
throws IOException { public void jsonInputsTSVMissingHeader() throws IOException { File inputJson = testFolder.newFile("inputs.json"); File annotationSource = testFolder.newFile("annotation.tsv"); - createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4); + createJsonInputs(inputJson, annotationSource, "blah", false, 3, 4, true); AnnotationInputs ais = AnnotateUtils.getInputs(inputJson.getAbsolutePath()); try { @@ -87,7 +88,7 @@ public void jsonInputsShouldHandleValidInputs() throws IOException { // Given valid inputs, the method should process them without throwing exceptions File validInputJson = testFolder.newFile("valid_inputs.json"); File validAnnotationSource = testFolder.newFile("valid_annotation.vcf"); - createJsonInputs(validInputJson, validAnnotationSource, "valid", false, 3, 4); + createJsonInputs(validInputJson, validAnnotationSource, "valid", false, 3, 4, true); // When AnnotationInputs ais = AnnotateUtils.getInputs(validInputJson.getAbsolutePath()); @@ -98,9 +99,9 @@ public void jsonInputsShouldHandleValidInputs() throws IOException { List sources = new ArrayList<>(); AnnotateUtils.populateAnnotationSources(ais, sources); - Assert.assertEquals(1, sources.size()); + assertEquals(1, sources.size()); - String annotation = sources.get(0).getAnnotation(new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); + String annotation = sources.getFirst().getAnnotation(ChrPositionUtils.convertContigAndPositionToLong("1", 95813205), new ChrPositionRefAlt("chr1", 95813205, 95813205, "C", "T")); Assert.assertEquals("valid=", annotation); } @@ -108,13 +109,13 @@ public void jsonInputsShouldHandleValidInputs() throws IOException { public void jsonInputsTSV() throws IOException { File inputJson = testFolder.newFile("inputs.json"); File annotationSource = testFolder.newFile("annotation.tsv"); - createJsonInputs(inputJson, annotationSource, "aaref,HGVSc_VEP,HGVSp_VEP", false, 3, 4); + createJsonInputs(inputJson, annotationSource, 
"aaref,HGVSc_VEP,HGVSp_VEP", false, 3, 4, false); createAnnotationFile(annotationSource, true); List sources = new ArrayList<>(2); AnnotationInputs ais = AnnotateUtils.getInputs(inputJson.getAbsolutePath()); AnnotateUtils.populateAnnotationSources(ais, sources); assertEquals(1, sources.size()); - String annotation = sources.get(0).getAnnotation(new ChrPositionRefAlt("chr1", 655652, 655652, "A", "T")); + String annotation = sources.getFirst().getAnnotation(ChrPositionUtils.convertContigAndPositionToLong("1", 655652), new ChrPositionRefAlt("chr1", 655652, 655652, "A", "T")); assertEquals("HGVSc_VEP=c.1A>C\tHGVSp_VEP=p.Met1?\taaref=M", annotation); } @@ -139,7 +140,7 @@ public void endToEnd() throws IOException { /* * json inputs - need annotationSource deets */ - createJsonInputs(inputJson, annotationSource, "aaref", false, 3, 4); + createJsonInputs(inputJson, annotationSource, "aaref", false, 3, 4, false); int exitValue = executeTest(inputVcf, inputJson, outputFile, logFile); assertEquals(1, exitValue); @@ -194,7 +195,7 @@ public void endToEndSnpEff() throws IOException { /* * json inputs - need annotationSource deets */ - createJsonInputs(inputJson, snpEffAnnotationSource, "gene_name,feature_id,feature_type,effect,cdna_position,cds_position,protein_position,putative_impact,hgvs.c,hgvs.p", true, 4, 5); + createJsonInputs(inputJson, snpEffAnnotationSource, "gene_name,feature_id,feature_type,effect,cdna_position,cds_position,protein_position,putative_impact,hgvs.c,hgvs.p", true, 4, 5, true); int exitValue = executeTest(inputVcf, inputJson, outputFile, logFile); assertEquals(0, exitValue); @@ -232,7 +233,7 @@ private int executeTest(File inputVcf, File inputJson, File outputFile, File log return 1; } - public static void createJsonInputs(File jsonFile, File annotationFile, String annotationFields, boolean snpEffAnnotationFile, int refPos, int altPos) throws IOException { + public static void createJsonInputs(File jsonFile, File annotationFile, String annotationFields, 
boolean snpEffAnnotationFile, int refPos, int altPos, boolean chrStartsWithChr) throws IOException { List data = Arrays.asList( "{", "\"outputFieldOrder\": \"" + annotationFields + "\",", @@ -242,6 +243,7 @@ public static void createJsonInputs(File jsonFile, File annotationFile, String a "\"inputs\": [{", "\"file\": \"" + annotationFile.getAbsolutePath() + "\",", "\"snpEffVcf\": " + snpEffAnnotationFile + ",", + "\"chrStartsWithChr\": " + chrStartsWithChr + ",", "\"chrIndex\": 1,", "\"positionIndex\": 2,", "\"refIndex\": " + refPos + ",", diff --git a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java index 44415ff0d..c1cc590a1 100644 --- a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java +++ b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTSVTest.java @@ -15,26 +15,26 @@ public class AnnotationSourceTSVTest { @Test public void extractFieldsFromRecord() { assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(null, null)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("", null)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("blah", null)); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{}, null)); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"blah"}, null)); assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(null, new HashMap<>())); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("", new HashMap<>())); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{}, new HashMap<>())); Map fields = new HashMap<>(); - fields.put("foo", Integer.valueOf(0)); - assertEquals("foo=short_record", AnnotationSourceTSV.extractFieldsFromRecord("short_record", fields)); - assertEquals("foo=slightly_longer", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord", fields)); - fields.put("foo", 
Integer.valueOf(10)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord", fields)); - assertEquals("foo=", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\t", fields)); - assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo2", Integer.valueOf(2)); - assertEquals("foo=bar\tfoo2=", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo2", Integer.valueOf(1)); - assertEquals("foo=bar\tfoo2=record", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo2", Integer.valueOf(11)); - assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); - fields.put("foo", Integer.valueOf(100)); - assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord("slightly_longer\trecord\t\t\t\t\t\t\t\t\tbar", fields)); + fields.put("foo", 0); + assertEquals("foo=short_record", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"short_record"}, fields)); + assertEquals("foo=slightly_longer", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer","record"}, fields)); + fields.put("foo", 10); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer","record"}, fields)); + assertEquals("foo=", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", ""}, fields)); + assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + fields.put("foo2", 2); + assertEquals("foo=bar\tfoo2=", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + 
fields.put("foo2", 1); + assertEquals("foo=bar\tfoo2=record", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + fields.put("foo2", 11); + assertEquals("foo=bar", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer", "record", "", "", "", "", "", "", "", "", "bar"}, fields)); + fields.put("foo", 100); + assertEquals("", AnnotationSourceTSV.extractFieldsFromRecord(new String[]{"slightly_longer","record","","","","","","","","","bar"}, fields)); } diff --git a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java index 29319d2a0..c4e20d5fa 100644 --- a/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java +++ b/qannotate/test/au/edu/qimr/qannotate/nanno/AnnotationSourceTest.java @@ -4,6 +4,7 @@ import org.junit.Rule; import org.junit.Test; import org.junit.rules.TemporaryFolder; +import org.qcmg.common.model.ChrPointPosition; import org.qcmg.common.model.ChrPosition; import org.qcmg.common.util.ChrPositionUtils; @@ -14,114 +15,189 @@ import static org.junit.Assert.*; public class AnnotationSourceTest { - - @Rule - public final TemporaryFolder testFolder = new TemporaryFolder(); - - - @Test - public void compareNameAndPositions() { - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "chr1", 1)); - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "1", 1)); - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "chr1", 1)); - assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 1)); - assertEquals(-1, Integer.compare(1, 2)); - assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 2)); - assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "2", 2)); - assertEquals(0, 
AnnotationSource.compareChromosomeNameAndStartPositions("2", 2, "2", 2)); - assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("2", 3, "2", 2)); - assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("3", 32, "2", 2)); - } - - @Test - public void compareCPs() { - ChrPosition cp1 = null; - ChrPosition cp2 = null; - assertEquals(1, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp1 = ChrPositionUtils.getChrPosition("chr1", 1, 1); - assertEquals(-1, AnnotationSource.areCPsEqual(cp1, cp2)); - cp2 = ChrPositionUtils.getChrPosition("chr1", 1, 1); - assertEquals(0, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp2 = ChrPositionUtils.getChrPosition("chr1", 2, 2); - assertEquals(-1, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp1 = ChrPositionUtils.getChrPosition("chr1", 3, 3); - assertEquals(1, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp2 = ChrPositionUtils.getChrPosition("chr1", 3, 3); - assertEquals(0, AnnotationSource.areCPsEqual(cp1, cp2)); - - cp2 = ChrPositionUtils.getChrPosition("chr3", 3, 3); - assertEquals(-2, AnnotationSource.areCPsEqual(cp1, cp2)); - } - - @Test - public void isThisOurRecord() { - ChrPosition cp1 = ChrPositionUtils.getChrPosition("chr10", 1, 1); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, null, 0)); - assertEquals(1, AnnotationSource.isThisOurRecord(null, null, 0)); - assertEquals(1, AnnotationSource.isThisOurRecord(cp1, "chr10", 0)); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, "chr10", 2)); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, "chr11", 1)); - assertEquals(1, AnnotationSource.isThisOurRecord(cp1, "chr9", 1)); - assertEquals(0, AnnotationSource.isThisOurRecord(cp1, "chr10", 1)); - assertEquals(-1, AnnotationSource.isThisOurRecord(cp1, "chr10", 10)); - assertEquals(1, AnnotationSource.isThisOurRecord(cp1, "chr10", 0)); - } - - @Test - public void getCPFromStringArray() { - try { + + @Rule + public final TemporaryFolder testFolder = new TemporaryFolder(); + + + 
@Test + public void compareNameAndPositions() { + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "chr1", 1)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("chr1", 1, "1", 1)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "chr1", 1)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 1)); + assertEquals(-1, Integer.compare(1, 2)); + assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "1", 2)); + assertEquals(-1, AnnotationSource.compareChromosomeNameAndStartPositions("1", 1, "2", 2)); + assertEquals(0, AnnotationSource.compareChromosomeNameAndStartPositions("2", 2, "2", 2)); + assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("2", 3, "2", 2)); + assertEquals(1, AnnotationSource.compareChromosomeNameAndStartPositions("3", 32, "2", 2)); + } + + @Test + public void testIsThisOurRecordShortcut() { + ChrPosition cp = new ChrPointPosition("chr1", 1000); + long cpAsLong = ChrPositionUtils.convertContigAndPositionToLong("chr1", 1000); + + // Match chr, within bounds + int result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr1\t1000\tATGC", true); + assertEquals(0, result); + + // Match chr, but out of bounds + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr1\t2500\tATGC", true); + assertEquals(-1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr1\t500\tATGC", true); + Assert.assertEquals(1, result); + + // No chr match + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr2\t500\tATGC", true); + assertEquals(-1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr2\t1000\tATGC", true); + assertEquals(-1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr2\t1500\tATGC", true); + assertEquals(-1, result); + + // Null ChrPosition + result = 
AnnotationSource.isThisOurRecordShortcut(-1, "chr1\t1500\tATGC", true); + assertEquals(1, result); + + cp = new ChrPointPosition("chr10", 246987); + cpAsLong = ChrPositionUtils.convertContigAndPositionToLong("chr10", 246987); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr9\t138122079\tATGC", true); + assertEquals(1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr10\t1000\tATGC", true); + assertEquals(1, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr10\t246987\tATGC", true); + assertEquals(0, result); + result = AnnotationSource.isThisOurRecordShortcut(cpAsLong, "chr10\t2469870\tATGC", true); + assertEquals(-1, result); + } + + @Test + public void compareCPs() { + ChrPosition cp1 = null; + ChrPosition cp2 = null; + assertEquals(1, AnnotationSource.compareCPs(cp1, cp2)); + + cp1 = ChrPositionUtils.getChrPosition("chr1", 1, 1); + assertEquals(-1, AnnotationSource.compareCPs(cp1, cp2)); + cp2 = ChrPositionUtils.getChrPosition("chr1", 1, 1); + assertEquals(0, AnnotationSource.compareCPs(cp1, cp2)); + + cp2 = ChrPositionUtils.getChrPosition("chr1", 2, 2); + assertEquals(-1, AnnotationSource.compareCPs(cp1, cp2)); + + cp1 = ChrPositionUtils.getChrPosition("chr1", 3, 3); + assertEquals(1, AnnotationSource.compareCPs(cp1, cp2)); + + cp2 = ChrPositionUtils.getChrPosition("chr1", 3, 3); + assertEquals(0, AnnotationSource.compareCPs(cp1, cp2)); + + cp2 = ChrPositionUtils.getChrPosition("chr3", 3, 3); + assertEquals(-2, AnnotationSource.compareCPs(cp1, cp2)); + } + + @Test + public void isThisOurRecord() { + ChrPosition cp1 = ChrPositionUtils.getChrPosition("chr10", 1, 1); + long l = ChrPositionUtils.convertContigAndPositionToLong("10", 1); +// assertEquals(-1, AnnotationSource.isThisOurRecord(l, null, 0, false)); + assertEquals(1, AnnotationSource.isThisOurRecord(-1, null, 0, false)); + assertEquals(1, AnnotationSource.isThisOurRecord(l, "chr10", 0, true)); + assertEquals(-1, 
AnnotationSource.isThisOurRecord(l, "chr10", 2, true)); + assertEquals(-1, AnnotationSource.isThisOurRecord(l, "chr11", 1, true)); + assertEquals(1, AnnotationSource.isThisOurRecord(l, "chr9", 1, true)); + assertEquals(0, AnnotationSource.isThisOurRecord(l, "chr10", 1, true)); + assertEquals(-1, AnnotationSource.isThisOurRecord(l, "chr10", 10, true)); + assertEquals(1, AnnotationSource.isThisOurRecord(l, "chr10", 0, true)); + } + + @Test + public void getCPFromStringArray() { + try { assertNull(AnnotationSource.getCpFromRecord(null, 0, 0)); - Assert.fail("Should have thrown IAE"); - } catch (IllegalArgumentException iae) {} - try { + Assert.fail("Should have thrown IAE"); + } catch (IllegalArgumentException iae) { + } + try { assertNull(AnnotationSource.getCpFromRecord(new String[]{}, 0, 0)); - Assert.fail("Should have thrown IAE"); - } catch (IllegalArgumentException iae) {} - ChrPosition cp = ChrPositionUtils.getChrPosition("1", 1, 1); - assertEquals(cp, AnnotationSource.getCpFromRecord(new String[]{"1"}, 0, 0)); - assertEquals(ChrPositionUtils.getChrPosition("1", 0, 0), AnnotationSource.getCpFromRecord(new String[]{"1", "0"}, 0, 1)); - assertEquals(ChrPositionUtils.getChrPosition("1", 2, 2), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 0, 1)); - assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 1, 0)); - try { - assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 10, 0)); - Assert.fail("Should have thrown IAE"); - } catch (IllegalArgumentException iae) {} - } - - @Test - public void vcfFields() { - String info = ""; - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("",""), "EMPTY")); - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", Arrays.asList("",""), "EMPTY")); - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", List.of(""), 
"EMPTY")); - info = "ALLELEID=75079;CLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202;CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided;CLNHGVS=NC_000001.11:g.1232279A>G;CLNREVSTAT=criteria_provided,_multiple_submitters,_no_conflicts;CLNSIG=Pathogenic;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=OMIM_Allelic_Variant:615291.0001;GENEINFO=B3GALT6:126792;MC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant;ORIGIN=1;RS=786200938"; - assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of(""), "EMPTY")); - assertEquals("ALLELEID=75079", AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of("ALLELEID"), "EMPTY")); - assertEquals("ALLELEID=75079\tCLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("ALLELEID","CLNDISDB"), "EMPTY")); - assertEquals("CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided\tMC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("CLNDN","MC"), "EMPTY")); - info = 
"ALLELEID=1211496;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.19251559C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=EMC1:23065;ORIGIN=1"; - assertEquals("MC=\tCLNDN=not_provided", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("MC","CLNDN"), "EMPTY")); - } - - @Test - public void tsvGetHeader() { - String header = "#chr pos(1-based) ref alt aaref aaalt rs_dbSNP151 hg19_chr hg19_pos(1-based) hg18_chr hg18_pos(1-based) aapos genename Ensembl_geneid Ensembl_transcriptid Ensembl_proteinid Uniprot_acc Uniprot_entry HGVSc_ANNOVAR HGVSp_ANNOVAR HGVSc_snpEff HGVSp_snpEff HGVSc_VEP HGVSp_VEP APPRIS GENCODE_basic TSL VEP_canonical cds_strand refcodon codonpos codon_degeneracy Ancestral_allele AltaiNeandertal Denisova VindijiaNeandertal SIFT_score SIFT_converted_rankscore SIFT_pred SIFT4G_score SIFT4G_converted_rankscore SIFT4G_pred Polyphen2_HDIV_score Polyphen2_HDIV_rankscore Polyphen2_HDIV_pred Polyphen2_HVAR_score Polyphen2_HVAR_rankscore Polyphen2_HVAR_pred LRT_score LRT_converted_rankscore LRT_pred LRT_Omega MutationTaster_score MutationTaster_converted_rankscore MutationTaster_pred MutationTaster_model MutationTaster_AAE MutationAssessor_score MutationAssessor_rankscore MutationAssessor_pred FATHMM_score FATHMM_converted_rankscore FATHMM_pred PROVEAN_score PROVEAN_converted_rankscore PROVEAN_pred VEST4_score VEST4_rankscore MetaSVM_score MetaSVM_rankscore MetaSVM_pred MetaLR_score MetaLR_rankscore MetaLR_pred Reliability_index M-CAP_score M-CAP_rankscore M-CAP_pred REVEL_score REVEL_rankscore MutPred_score MutPred_rankscore MutPred_protID MutPred_AAchange MutPred_Top5features MVP_score MVP_rankscore MPC_score MPC_rankscore PrimateAI_score PrimateAI_rankscore PrimateAI_pred DEOGEN2_score DEOGEN2_rankscore DEOGEN2_pred BayesDel_addAF_score BayesDel_addAF_rankscore BayesDel_addAF_pred BayesDel_noAF_score BayesDel_noAF_rankscore BayesDel_noAF_pred 
ClinPred_score ClinPred_rankscore ClinPred_pred LIST-S2_score LIST-S2_rankscore LIST-S2_pred Aloft_Fraction_transcripts_affected Aloft_prob_Tolerant Aloft_prob_Recessive Aloft_prob_Dominant Aloft_pred Aloft_Confidence CADD_raw CADD_raw_rankscore CADD_phred CADD_raw_hg19 CADD_raw_rankscore_hg19 CADD_phred_hg19 DANN_score DANN_rankscore fathmm-MKL_coding_score fathmm-MKL_coding_rankscore fathmm-MKL_coding_pred fathmm-MKL_coding_group fathmm-XF_coding_score fathmm-XF_coding_rankscore fathmm-XF_coding_pred Eigen-raw_coding Eigen-raw_coding_rankscore Eigen-phred_coding Eigen-PC-raw_coding Eigen-PC-raw_coding_rankscore Eigen-PC-phred_coding GenoCanyon_score GenoCanyon_rankscore integrated_fitCons_score integrated_fitCons_rankscore integrated_confidence_value GM12878_fitCons_score GM12878_fitCons_rankscore GM12878_confidence_value H1-hESC_fitCons_score H1-hESC_fitCons_rankscore H1-hESC_confidence_value HUVEC_fitCons_score HUVEC_fitCons_rankscore HUVEC_confidence_value LINSIGHT LINSIGHT_rankscore GERP++_NR GERP++_RS GERP++_RS_rankscore phyloP100way_vertebrate phyloP100way_vertebrate_rankscore phyloP30way_mammalian phyloP30way_mammalian_rankscore phyloP17way_primate phyloP17way_primate_rankscore phastCons100way_vertebrate phastCons100way_vertebrate_rankscore phastCons30way_mammalian phastCons30way_mammalian_rankscore phastCons17way_primatephastCons17way_primate_rankscore SiPhy_29way_pi SiPhy_29way_logOdds SiPhy_29way_logOdds_rankscore bStatistic bStatistic_converted_rankscore 1000Gp3_AC 1000Gp3_AF 1000Gp3_AFR_AC 1000Gp3_AFR_AF 1000Gp3_EUR_AC 1000Gp3_EUR_AF 1000Gp3_AMR_AC 1000Gp3_AMR_AF 1000Gp3_EAS_AC 1000Gp3_EAS_AF 1000Gp3_SAS_AC 1000Gp3_SAS_AF TWINSUK_AC TWINSUK_AF ALSPAC_AC ALSPAC_AF UK10K_AC UK10K_AF ESP6500_AA_AC ESP6500_AA_AF ESP6500_EA_AC ESP6500_EA_AF ExAC_AC ExAC_AF ExAC_Adj_AC ExAC_Adj_AF ExAC_AFR_AC ExAC_AFR_AF ExAC_AMR_AC ExAC_AMR_AF ExAC_EAS_AC ExAC_EAS_AF ExAC_FIN_AC ExAC_FIN_AF ExAC_NFE_AC ExAC_NFE_AF ExAC_SAS_AC ExAC_SAS_AF ExAC_nonTCGA_AC ExAC_nonTCGA_AF 
ExAC_nonTCGA_Adj_AC ExAC_nonTCGA_Adj_AF ExAC_nonTCGA_AFR_AC ExAC_nonTCGA_AFR_AF ExAC_nonTCGA_AMR_AC ExAC_nonTCGA_AMR_AF ExAC_nonTCGA_EAS_AC ExAC_nonTCGA_EAS_AF ExAC_nonTCGA_FIN_AC ExAC_nonTCGA_FIN_AF ExAC_nonTCGA_NFE_AC ExAC_nonTCGA_NFE_AF ExAC_nonTCGA_SAS_AC ExAC_nonTCGA_SAS_AF ExAC_nonpsych_AC ExAC_nonpsych_AF ExAC_nonpsych_Adj_AC ExAC_nonpsych_Adj_AF ExAC_nonpsych_AFR_AC ExAC_nonpsych_AFR_AF ExAC_nonpsych_AMR_AC ExAC_nonpsych_AMR_AF ExAC_nonpsych_EAS_AC ExAC_nonpsych_EAS_AF ExAC_nonpsych_FIN_AC ExAC_nonpsych_FIN_AF ExAC_nonpsych_NFE_AC ExAC_nonpsych_NFE_AF ExAC_nonpsych_SAS_AC ExAC_nonpsych_SAS_AF gnomAD_exomes_flag gnomAD_exomes_AC gnomAD_exomes_AN gnomAD_exomes_AF gnomAD_exomes_nhomalt gnomAD_exomes_AFR_AC gnomAD_exomes_AFR_AN gnomAD_exomes_AFR_AF gnomAD_exomes_AFR_nhomalt gnomAD_exomes_AMR_AC gnomAD_exomes_AMR_AN gnomAD_exomes_AMR_AF gnomAD_exomes_AMR_nhomalt gnomAD_exomes_ASJ_AC gnomAD_exomes_ASJ_AN gnomAD_exomes_ASJ_AF gnomAD_exomes_ASJ_nhomalt gnomAD_exomes_EAS_AC gnomAD_exomes_EAS_AN gnomAD_exomes_EAS_AF gnomAD_exomes_EAS_nhomalt gnomAD_exomes_FIN_AC gnomAD_exomes_FIN_AN gnomAD_exomes_FIN_AF gnomAD_exomes_FIN_nhomalt gnomAD_exomes_NFE_AC gnomAD_exomes_NFE_AN gnomAD_exomes_NFE_AF gnomAD_exomes_NFE_nhomalt gnomAD_exomes_SAS_AC gnomAD_exomes_SAS_AN gnomAD_exomes_SAS_AF gnomAD_exomes_SAS_nhomalt gnomAD_exomes_POPMAX_AC gnomAD_exomes_POPMAX_AN gnomAD_exomes_POPMAX_AF gnomAD_exomes_POPMAX_nhomalt gnomAD_exomes_controls_AC gnomAD_exomes_controls_AN gnomAD_exomes_controls_AF gnomAD_exomes_controls_nhomalt gnomAD_exomes_controls_AFR_AC gnomAD_exomes_controls_AFR_AN gnomAD_exomes_controls_AFR_AF gnomAD_exomes_controls_AFR_nhomalt gnomAD_exomes_controls_AMR_AC gnomAD_exomes_controls_AMR_AN gnomAD_exomes_controls_AMR_AF gnomAD_exomes_controls_AMR_nhomalt gnomAD_exomes_controls_ASJ_AC gnomAD_exomes_controls_ASJ_AN gnomAD_exomes_controls_ASJ_AF gnomAD_exomes_controls_ASJ_nhomalt gnomAD_exomes_controls_EAS_AC gnomAD_exomes_controls_EAS_AN gnomAD_exomes_controls_EAS_AF 
gnomAD_exomes_controls_EAS_nhomalt gnomAD_exomes_controls_FIN_AC gnomAD_exomes_controls_FIN_AN gnomAD_exomes_controls_FIN_AF gnomAD_exomes_controls_FIN_nhomalt gnomAD_exomes_controls_NFE_AC gnomAD_exomes_controls_NFE_AN gnomAD_exomes_controls_NFE_AF gnomAD_exomes_controls_NFE_nhomalt gnomAD_exomes_controls_SAS_AC gnomAD_exomes_controls_SAS_AN gnomAD_exomes_controls_SAS_AF gnomAD_exomes_controls_SAS_nhomalt gnomAD_exomes_controls_POPMAX_AC gnomAD_exomes_controls_POPMAX_AN gnomAD_exomes_controls_POPMAX_AF gnomAD_exomes_controls_POPMAX_nhomalt gnomAD_genomes_flag gnomAD_genomes_AC gnomAD_genomes_AN gnomAD_genomes_AF gnomAD_genomes_nhomalt gnomAD_genomes_AFR_AC gnomAD_genomes_AFR_AN gnomAD_genomes_AFR_AF gnomAD_genomes_AFR_nhomalt gnomAD_genomes_AMR_AC gnomAD_genomes_AMR_AN gnomAD_genomes_AMR_AF gnomAD_genomes_AMR_nhomalt gnomAD_genomes_ASJ_AC gnomAD_genomes_ASJ_AN gnomAD_genomes_ASJ_AF gnomAD_genomes_ASJ_nhomalt gnomAD_genomes_EAS_AC gnomAD_genomes_EAS_AN gnomAD_genomes_EAS_AF gnomAD_genomes_EAS_nhomalt gnomAD_genomes_FIN_AC gnomAD_genomes_FIN_AN gnomAD_genomes_FIN_AF gnomAD_genomes_FIN_nhomalt gnomAD_genomes_NFE_AC gnomAD_genomes_NFE_AN gnomAD_genomes_NFE_AF gnomAD_genomes_NFE_nhomalt gnomAD_genomes_AMI_AC gnomAD_genomes_AMI_AN gnomAD_genomes_AMI_AF gnomAD_genomes_AMI_nhomalt gnomAD_genomes_SAS_AC gnomAD_genomes_SAS_AN gnomAD_genomes_SAS_AF gnomAD_genomes_SAS_nhomalt gnomAD_genomes_POPMAX_AC gnomAD_genomes_POPMAX_AN gnomAD_genomes_POPMAX_AF gnomAD_genomes_POPMAX_nhomalt clinvar_id clinvar_clnsig clinvar_trait clinvar_review clinvar_hgvs clinvar_var_source clinvar_MedGen_id clinvar_OMIM_id clinvar_Orphanet_id Interpro_domain GTEx_V8_gene GTEx_V8_tissueGeuvadis_eQTL_target_gene"; - Map headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr", header); - assertEquals(1, headerMap.size()); + Assert.fail("Should have thrown IAE"); + } catch (IllegalArgumentException iae) { + } + ChrPosition cp = ChrPositionUtils.getChrPosition("1", 1, 1); + assertEquals(cp, 
AnnotationSource.getCpFromRecord(new String[]{"1"}, 0, 0)); + assertEquals(ChrPositionUtils.getChrPosition("1", 0, 0), AnnotationSource.getCpFromRecord(new String[]{"1", "0"}, 0, 1)); + assertEquals(ChrPositionUtils.getChrPosition("1", 2, 2), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 0, 1)); + assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 1, 0)); + try { + assertEquals(ChrPositionUtils.getChrPosition("2", 1, 1), AnnotationSource.getCpFromRecord(new String[]{"1", "2"}, 10, 0)); + Assert.fail("Should have thrown IAE"); + } catch (IllegalArgumentException iae) { + } + } + + @Test + public void vcfFields() { + String info = ""; + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("", ""), "EMPTY")); + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", Arrays.asList("", ""), "EMPTY")); + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(".", List.of(""), "EMPTY")); + info = "ALLELEID=75079;CLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202;CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided;CLNHGVS=NC_000001.11:g.1232279A>G;CLNREVSTAT=criteria_provided,_multiple_submitters,_no_conflicts;CLNSIG=Pathogenic;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=OMIM_Allelic_Variant:615291.0001;GENEINFO=B3GALT6:126792;MC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant;ORIGIN=1;RS=786200938"; + assertEquals("EMPTY", AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of(""), "EMPTY")); + assertEquals("ALLELEID=75079", 
AnnotationSourceVCF.extractFieldsFromInfoField(info, List.of("ALLELEID"), "EMPTY")); + assertEquals("ALLELEID=75079\tCLNDISDB=MONDO:MONDO:0010075,MedGen:C4017377,OMIM:271640|MONDO:MONDO:0014139,MedGen:C3809210,OMIM:615349,Orphanet:ORPHA536467|MONDO:MONDO:0019675,MedGen:C0432243,OMIM:PS271640,Orphanet:ORPHA93359,SNOMED_CT:254100000|MedGen:CN517202", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("ALLELEID", "CLNDISDB"), "EMPTY")); + assertEquals("CLNDN=Spondyloepimetaphyseal_dysplasia_with_joint_laxity,_type_1,_with_or_without_fractures|Ehlers-Danlos_syndrome,_spondylodysplastic_type,_2|Spondyloepimetaphyseal_dysplasia_with_joint_laxity|not_provided\tMC=SO:0001582|initiatior_codon_variant,SO:0001583|missense_variant", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("CLNDN", "MC"), "EMPTY")); + info = "ALLELEID=1211496;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=NC_000001.11:g.19251559C>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Benign;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;GENEINFO=EMC1:23065;ORIGIN=1"; + assertEquals("MC=\tCLNDN=not_provided", AnnotationSourceVCF.extractFieldsFromInfoField(info, Arrays.asList("MC", "CLNDN"), "EMPTY")); + } + + @Test + public void tsvGetHeader() { + String header = "#chr pos(1-based) ref alt aaref aaalt rs_dbSNP151 hg19_chr hg19_pos(1-based) hg18_chr hg18_pos(1-based) aapos genename Ensembl_geneid Ensembl_transcriptid Ensembl_proteinid Uniprot_acc Uniprot_entry HGVSc_ANNOVAR HGVSp_ANNOVAR HGVSc_snpEff HGVSp_snpEff HGVSc_VEP HGVSp_VEP APPRIS GENCODE_basic TSL VEP_canonical cds_strand refcodon codonpos codon_degeneracy Ancestral_allele AltaiNeandertal Denisova VindijiaNeandertal SIFT_score SIFT_converted_rankscore SIFT_pred SIFT4G_score SIFT4G_converted_rankscore SIFT4G_pred Polyphen2_HDIV_score Polyphen2_HDIV_rankscore Polyphen2_HDIV_pred Polyphen2_HVAR_score Polyphen2_HVAR_rankscore Polyphen2_HVAR_pred LRT_score LRT_converted_rankscore LRT_pred LRT_Omega 
MutationTaster_score MutationTaster_converted_rankscore MutationTaster_pred MutationTaster_model MutationTaster_AAE MutationAssessor_score MutationAssessor_rankscore MutationAssessor_pred FATHMM_score FATHMM_converted_rankscore FATHMM_pred PROVEAN_score PROVEAN_converted_rankscore PROVEAN_pred VEST4_score VEST4_rankscore MetaSVM_score MetaSVM_rankscore MetaSVM_pred MetaLR_score MetaLR_rankscore MetaLR_pred Reliability_index M-CAP_score M-CAP_rankscore M-CAP_pred REVEL_score REVEL_rankscore MutPred_score MutPred_rankscore MutPred_protID MutPred_AAchange MutPred_Top5features MVP_score MVP_rankscore MPC_score MPC_rankscore PrimateAI_score PrimateAI_rankscore PrimateAI_pred DEOGEN2_score DEOGEN2_rankscore DEOGEN2_pred BayesDel_addAF_score BayesDel_addAF_rankscore BayesDel_addAF_pred BayesDel_noAF_score BayesDel_noAF_rankscore BayesDel_noAF_pred ClinPred_score ClinPred_rankscore ClinPred_pred LIST-S2_score LIST-S2_rankscore LIST-S2_pred Aloft_Fraction_transcripts_affected Aloft_prob_Tolerant Aloft_prob_Recessive Aloft_prob_Dominant Aloft_pred Aloft_Confidence CADD_raw CADD_raw_rankscore CADD_phred CADD_raw_hg19 CADD_raw_rankscore_hg19 CADD_phred_hg19 DANN_score DANN_rankscore fathmm-MKL_coding_score fathmm-MKL_coding_rankscore fathmm-MKL_coding_pred fathmm-MKL_coding_group fathmm-XF_coding_score fathmm-XF_coding_rankscore fathmm-XF_coding_pred Eigen-raw_coding Eigen-raw_coding_rankscore Eigen-phred_coding Eigen-PC-raw_coding Eigen-PC-raw_coding_rankscore Eigen-PC-phred_coding GenoCanyon_score GenoCanyon_rankscore integrated_fitCons_score integrated_fitCons_rankscore integrated_confidence_value GM12878_fitCons_score GM12878_fitCons_rankscore GM12878_confidence_value H1-hESC_fitCons_score H1-hESC_fitCons_rankscore H1-hESC_confidence_value HUVEC_fitCons_score HUVEC_fitCons_rankscore HUVEC_confidence_value LINSIGHT LINSIGHT_rankscore GERP++_NR GERP++_RS GERP++_RS_rankscore phyloP100way_vertebrate phyloP100way_vertebrate_rankscore phyloP30way_mammalian 
phyloP30way_mammalian_rankscore phyloP17way_primate phyloP17way_primate_rankscore phastCons100way_vertebrate phastCons100way_vertebrate_rankscore phastCons30way_mammalian phastCons30way_mammalian_rankscore phastCons17way_primatephastCons17way_primate_rankscore SiPhy_29way_pi SiPhy_29way_logOdds SiPhy_29way_logOdds_rankscore bStatistic bStatistic_converted_rankscore 1000Gp3_AC 1000Gp3_AF 1000Gp3_AFR_AC 1000Gp3_AFR_AF 1000Gp3_EUR_AC 1000Gp3_EUR_AF 1000Gp3_AMR_AC 1000Gp3_AMR_AF 1000Gp3_EAS_AC 1000Gp3_EAS_AF 1000Gp3_SAS_AC 1000Gp3_SAS_AF TWINSUK_AC TWINSUK_AF ALSPAC_AC ALSPAC_AF UK10K_AC UK10K_AF ESP6500_AA_AC ESP6500_AA_AF ESP6500_EA_AC ESP6500_EA_AF ExAC_AC ExAC_AF ExAC_Adj_AC ExAC_Adj_AF ExAC_AFR_AC ExAC_AFR_AF ExAC_AMR_AC ExAC_AMR_AF ExAC_EAS_AC ExAC_EAS_AF ExAC_FIN_AC ExAC_FIN_AF ExAC_NFE_AC ExAC_NFE_AF ExAC_SAS_AC ExAC_SAS_AF ExAC_nonTCGA_AC ExAC_nonTCGA_AF ExAC_nonTCGA_Adj_AC ExAC_nonTCGA_Adj_AF ExAC_nonTCGA_AFR_AC ExAC_nonTCGA_AFR_AF ExAC_nonTCGA_AMR_AC ExAC_nonTCGA_AMR_AF ExAC_nonTCGA_EAS_AC ExAC_nonTCGA_EAS_AF ExAC_nonTCGA_FIN_AC ExAC_nonTCGA_FIN_AF ExAC_nonTCGA_NFE_AC ExAC_nonTCGA_NFE_AF ExAC_nonTCGA_SAS_AC ExAC_nonTCGA_SAS_AF ExAC_nonpsych_AC ExAC_nonpsych_AF ExAC_nonpsych_Adj_AC ExAC_nonpsych_Adj_AF ExAC_nonpsych_AFR_AC ExAC_nonpsych_AFR_AF ExAC_nonpsych_AMR_AC ExAC_nonpsych_AMR_AF ExAC_nonpsych_EAS_AC ExAC_nonpsych_EAS_AF ExAC_nonpsych_FIN_AC ExAC_nonpsych_FIN_AF ExAC_nonpsych_NFE_AC ExAC_nonpsych_NFE_AF ExAC_nonpsych_SAS_AC ExAC_nonpsych_SAS_AF gnomAD_exomes_flag gnomAD_exomes_AC gnomAD_exomes_AN gnomAD_exomes_AF gnomAD_exomes_nhomalt gnomAD_exomes_AFR_AC gnomAD_exomes_AFR_AN gnomAD_exomes_AFR_AF gnomAD_exomes_AFR_nhomalt gnomAD_exomes_AMR_AC gnomAD_exomes_AMR_AN gnomAD_exomes_AMR_AF gnomAD_exomes_AMR_nhomalt gnomAD_exomes_ASJ_AC gnomAD_exomes_ASJ_AN gnomAD_exomes_ASJ_AF gnomAD_exomes_ASJ_nhomalt gnomAD_exomes_EAS_AC gnomAD_exomes_EAS_AN gnomAD_exomes_EAS_AF gnomAD_exomes_EAS_nhomalt gnomAD_exomes_FIN_AC gnomAD_exomes_FIN_AN gnomAD_exomes_FIN_AF 
gnomAD_exomes_FIN_nhomalt gnomAD_exomes_NFE_AC gnomAD_exomes_NFE_AN gnomAD_exomes_NFE_AF gnomAD_exomes_NFE_nhomalt gnomAD_exomes_SAS_AC gnomAD_exomes_SAS_AN gnomAD_exomes_SAS_AF gnomAD_exomes_SAS_nhomalt gnomAD_exomes_POPMAX_AC gnomAD_exomes_POPMAX_AN gnomAD_exomes_POPMAX_AF gnomAD_exomes_POPMAX_nhomalt gnomAD_exomes_controls_AC gnomAD_exomes_controls_AN gnomAD_exomes_controls_AF gnomAD_exomes_controls_nhomalt gnomAD_exomes_controls_AFR_AC gnomAD_exomes_controls_AFR_AN gnomAD_exomes_controls_AFR_AF gnomAD_exomes_controls_AFR_nhomalt gnomAD_exomes_controls_AMR_AC gnomAD_exomes_controls_AMR_AN gnomAD_exomes_controls_AMR_AF gnomAD_exomes_controls_AMR_nhomalt gnomAD_exomes_controls_ASJ_AC gnomAD_exomes_controls_ASJ_AN gnomAD_exomes_controls_ASJ_AF gnomAD_exomes_controls_ASJ_nhomalt gnomAD_exomes_controls_EAS_AC gnomAD_exomes_controls_EAS_AN gnomAD_exomes_controls_EAS_AF gnomAD_exomes_controls_EAS_nhomalt gnomAD_exomes_controls_FIN_AC gnomAD_exomes_controls_FIN_AN gnomAD_exomes_controls_FIN_AF gnomAD_exomes_controls_FIN_nhomalt gnomAD_exomes_controls_NFE_AC gnomAD_exomes_controls_NFE_AN gnomAD_exomes_controls_NFE_AF gnomAD_exomes_controls_NFE_nhomalt gnomAD_exomes_controls_SAS_AC gnomAD_exomes_controls_SAS_AN gnomAD_exomes_controls_SAS_AF gnomAD_exomes_controls_SAS_nhomalt gnomAD_exomes_controls_POPMAX_AC gnomAD_exomes_controls_POPMAX_AN gnomAD_exomes_controls_POPMAX_AF gnomAD_exomes_controls_POPMAX_nhomalt gnomAD_genomes_flag gnomAD_genomes_AC gnomAD_genomes_AN gnomAD_genomes_AF gnomAD_genomes_nhomalt gnomAD_genomes_AFR_AC gnomAD_genomes_AFR_AN gnomAD_genomes_AFR_AF gnomAD_genomes_AFR_nhomalt gnomAD_genomes_AMR_AC gnomAD_genomes_AMR_AN gnomAD_genomes_AMR_AF gnomAD_genomes_AMR_nhomalt gnomAD_genomes_ASJ_AC gnomAD_genomes_ASJ_AN gnomAD_genomes_ASJ_AF gnomAD_genomes_ASJ_nhomalt gnomAD_genomes_EAS_AC gnomAD_genomes_EAS_AN gnomAD_genomes_EAS_AF gnomAD_genomes_EAS_nhomalt gnomAD_genomes_FIN_AC gnomAD_genomes_FIN_AN gnomAD_genomes_FIN_AF gnomAD_genomes_FIN_nhomalt 
gnomAD_genomes_NFE_AC gnomAD_genomes_NFE_AN gnomAD_genomes_NFE_AF gnomAD_genomes_NFE_nhomalt gnomAD_genomes_AMI_AC gnomAD_genomes_AMI_AN gnomAD_genomes_AMI_AF gnomAD_genomes_AMI_nhomalt gnomAD_genomes_SAS_AC gnomAD_genomes_SAS_AN gnomAD_genomes_SAS_AF gnomAD_genomes_SAS_nhomalt gnomAD_genomes_POPMAX_AC gnomAD_genomes_POPMAX_AN gnomAD_genomes_POPMAX_AF gnomAD_genomes_POPMAX_nhomalt clinvar_id clinvar_clnsig clinvar_trait clinvar_review clinvar_hgvs clinvar_var_source clinvar_MedGen_id clinvar_OMIM_id clinvar_Orphanet_id Interpro_domain GTEx_V8_gene GTEx_V8_tissueGeuvadis_eQTL_target_gene"; + Map headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr", header); + assertEquals(1, headerMap.size()); assertTrue(headerMap.containsKey("chr")); assertTrue(headerMap.containsValue(0)); - - headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr,genename", header); - assertEquals(2, headerMap.size()); + + headerMap = AnnotationSourceTSV.getHeaderNameAndPositions("chr,genename", header); + assertEquals(2, headerMap.size()); assertTrue(headerMap.containsKey("chr")); assertTrue(headerMap.containsValue(0)); assertTrue(headerMap.containsKey("genename")); - assertEquals(Integer.valueOf(12), headerMap.get("genename")); - } - - + assertEquals(Integer.valueOf(12), headerMap.get("genename")); + } + + @Test + public void testAreChrPointPositionsEqual() { + + // test both chromosomes and start positions are identical + ChrPosition chrPos1 = new ChrPointPosition("chr1", 12345); // assumes chr, start, and end as properties + ChrPosition chrPos2 = new ChrPointPosition("chr1", 12345); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + // test for different chromosomes, start positions are identical + chrPos1 = new ChrPointPosition("chr1", 12345); + chrPos2 = new ChrPointPosition("1", 12345); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, 
chrPos2, false)); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + chrPos1 = new ChrPointPosition("1", 12345); + chrPos2 = new ChrPointPosition("1", 12345); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertTrue(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + + // test for different chromosomes, start positions are identical + chrPos1 = new ChrPointPosition("chr1", 12345); + chrPos2 = new ChrPointPosition("chr2", 12345); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + + // test for identical chromosomes, start positions are different + chrPos1 = new ChrPointPosition("chr1", 12345); + chrPos2 = new ChrPointPosition("chr1", 23456); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, false)); + assertFalse(AnnotationSource.areChrPointPositionsEqual(chrPos1, chrPos2, true)); + } } diff --git a/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java b/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java index 7c5787377..8fd4f1bd8 100644 --- a/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java +++ b/qcommon/src/org/qcmg/common/model/ChrPositionComparator.java @@ -1,165 +1,221 @@ /** * © Copyright QIMR Berghofer Medical Research Institute 2014-2016. - * + *

* This code is released under the terms outlined in the included LICENSE file. -*/ + */ package org.qcmg.common.model; +import java.io.Serial; import java.io.Serializable; -import java.util.Arrays; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; +import java.util.*; import org.qcmg.common.string.StringUtils; import org.qcmg.common.vcf.VcfRecord; -public class ChrPositionComparator implements Comparator, Serializable { - - /** - * not very sure. At moment I accept the old version of this comparator, so always use the default uid. - */ - private static final long serialVersionUID = 1L; - private static final ReferenceNameComparator COMPARATOR = new ReferenceNameComparator(); - public static final List contigs = Collections.unmodifiableList(Arrays.asList("chr1","chr2", "chr3","chr4","chr5","chr6","chr7","chr8","chr9", - "chr10","chr11","chr12","chr13","chr14","chr15","chr16","chr17","chr18","chr19","chr20","chr21","chr22","chrX","chrY", - "GL000191.1","GL000192.1","GL000193.1","GL000194.1","GL000195.1","GL000196.1","GL000197.1","GL000198.1","GL000199.1", - "GL000200.1","GL000201.1","GL000202.1","GL000203.1","GL000204.1","GL000205.1","GL000206.1","GL000207.1","GL000208.1", - "GL000209.1","GL000210.1","GL000211.1","GL000212.1","GL000213.1","GL000214.1","GL000215.1","GL000216.1","GL000217.1", - "GL000218.1","GL000219.1","GL000220.1","GL000221.1","GL000222.1","GL000223.1","GL000224.1","GL000225.1","GL000226.1", - "GL000227.1","GL000228.1","GL000229.1","GL000230.1","GL000231.1","GL000232.1","GL000233.1","GL000234.1","GL000235.1", - "GL000236.1","GL000237.1","GL000238.1","GL000239.1","GL000240.1","GL000241.1","GL000242.1","GL000243.1","GL000244.1", - "GL000245.1","GL000246.1","GL000247.1","GL000248.1","GL000249.1","chrMT")); - - public static final List HG38_CONTIGS = Collections.unmodifiableList(Arrays.asList("1","2", "3","4","5","6","7","8","9", - "10","11","12","13","14","15","16","17","18","19","20","21","22","X","Y","M")); - - 
@Override - public int compare(ChrPosition o1, ChrPosition o2) { - int chromosomeDiff = COMPARATOR.compare(o1.getChromosome(), o2.getChromosome()); - if (chromosomeDiff != 0) - return chromosomeDiff; - - int positionDiff = o1.getStartPosition() - o2.getStartPosition(); - if (positionDiff != 0) - return positionDiff; - - return o1.getEndPosition() - o2.getEndPosition(); - } - - /** - * Creates a ChrPosition comparator that is based on the chromosome name comparator supplied as an argument. - * Allows the the user to be flexible as to how ChrPosition objects are compared - * @param chrNameComp - * @return - */ - public static Comparator getComparator(Comparator chrNameComp) { - - return Comparator.comparing(ChrPosition::getChromosome, chrNameComp) - .thenComparingInt(ChrPosition::getStartPosition) - .thenComparingInt(ChrPosition::getEndPosition); - } - - - /** - * This method is useful if you have a list of contigs whose order you want to preserve. - * eg. a sorted bam will in its header have a list of contigs, and it is possible that you would like to sort chromosome (Strings) based on this order - * - * If the list is empty of null, then then @link ReferenceNameComparator comparator will be returned. - * - * @param chrNameComp - * @return - */ - public static Comparator getChrNameComparator(List list) { - - return (null == list || list.isEmpty()) ? 
COMPARATOR : - new Comparator() { - @Override - public int compare(String o1, String o2) { - int i1 = list.indexOf(o1); - int i2 = list.indexOf(o2); - if (i1 >= 0 && i2 >= 0) { - return i1 - i2; - } else if (i1 >= 0 && i2 == -1) { - // o1.chr in list but not o2.chr => o1 < o2 - return -1; - } else if (i1 == -1 && i2 >= 0) { - // o2.chr in list but not o1.chr => o2 < o1 - return 1; - } else { - assert i1 == -1 && i2 == -1; - // neither o1 nor o2 chr in list => "natural" ordering - return o1.compareTo(o2); - } - } - }; - - } - - /** - * Return a comparator for VCF records, preserving the order according to the supplied - * list of contigs. If the CHROM value of record A is in the list but that of record B isn't - * then record A sorts earlier than the record B. If the CHROM value of neither A nor B is in - * the list then the records are sorted according to the "natural" order given by - * `ChrPositionComparator.compare(o1, o2)` - */ - public static Comparator getVcfRecordComparator(List list) { - - return (null == list || list.isEmpty()) ? 
null : - new Comparator() { - private final ChrPositionComparator chrPosComp = new ChrPositionComparator(); - @Override - public int compare(VcfRecord o1, VcfRecord o2) { - ChrPosition o1Pos = o1.getChrPosition(); - ChrPosition o2Pos = o2.getChrPosition(); - int i1 = list.indexOf(o1Pos.getChromosome()); - int i2 = list.indexOf(o2Pos.getChromosome()); - if (i1 >= 0 && i2 >= 0) { - // o1 & o2 chr in list => order by chr in list then pos - int diff = i1 - i2; - if (diff == 0) { - diff = o1Pos.getStartPosition() - o2Pos.getStartPosition(); - } - return diff; - } else if (i1 >= 0 && i2 == -1) { - // o1.chr in list but not o2.chr => o1 < o2 - return -1; - } else if (i1 == -1 && i2 >= 0) { - // o2.chr in list but not o1.chr => o2 < o1 - return 1; - } else { - assert i1 == -1 && i2 == -1; - // neither o1 nor o2 chr in list => "natural" ordering - return chrPosComp.compare(o1Pos, o2Pos); - } - - } - }; - } - - /** - * Convenience method to return a VCFRecord comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB - * - * @return - */ - public static Comparator getVcfRecordComparatorForGRCh37() { - return getVcfRecordComparator(contigs); - } - /** - * Convenience method to return a ChrPosition comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB - * - * @return - */ - public static Comparator getCPComparatorForGRCh37() { - return getComparator(getChrNameComparator(contigs)); - } - /** - * Convenience method to return a ChrPosition comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB - * - * @return - */ - public static Comparator getStringComparatorForHG38() { - return getChrNameComparator(HG38_CONTIGS); - } +public class ChrPositionComparator implements Comparator, Serializable { + + @Serial + private static final long serialVersionUID = 1L; + private static final ReferenceNameComparator COMPARATOR = new ReferenceNameComparator(); + public static final List contigs = List.of("chr1", "chr2", 
"chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chr19", "chr20", "chr21", "chr22", "chrX", "chrY", "GL000191.1", "GL000192.1", "GL000193.1", "GL000194.1", "GL000195.1", "GL000196.1", "GL000197.1", "GL000198.1", "GL000199.1", "GL000200.1", "GL000201.1", "GL000202.1", "GL000203.1", "GL000204.1", "GL000205.1", "GL000206.1", "GL000207.1", "GL000208.1", "GL000209.1", "GL000210.1", "GL000211.1", "GL000212.1", "GL000213.1", "GL000214.1", "GL000215.1", "GL000216.1", "GL000217.1", "GL000218.1", "GL000219.1", "GL000220.1", "GL000221.1", "GL000222.1", "GL000223.1", "GL000224.1", "GL000225.1", "GL000226.1", "GL000227.1", "GL000228.1", "GL000229.1", "GL000230.1", "GL000231.1", "GL000232.1", "GL000233.1", "GL000234.1", "GL000235.1", "GL000236.1", "GL000237.1", "GL000238.1", "GL000239.1", "GL000240.1", "GL000241.1", "GL000242.1", "GL000243.1", "GL000244.1", "GL000245.1", "GL000246.1", "GL000247.1", "GL000248.1", "GL000249.1", "chrMT"); + public static final Map HG37_CONTIGS_MAP; + + public static final Map HG38_CONTIGS_MAP = Map.ofEntries(Map.entry("1", 1), Map.entry("2", 2), Map.entry("3", 3), Map.entry("4", 4), Map.entry("5", 5), Map.entry("6", 6), Map.entry("7", 7), Map.entry("8", 8), Map.entry("9", 9), Map.entry("10", 10), Map.entry("11", 11), Map.entry("12", 12), Map.entry("13", 13), Map.entry("14", 14), Map.entry("15", 15), Map.entry("16", 16), Map.entry("17", 17), Map.entry("18", 18), Map.entry("19", 19), Map.entry("20", 20), Map.entry("21", 21), Map.entry("22", 22), Map.entry("X", 23), Map.entry("Y", 24), Map.entry("M", 25)); + + static { + Map temp = new HashMap<>(); + for (int i = 1; i <= 22; i++) { + temp.put("chr" + i, i); + } + temp.put("chrX", 23); + temp.put("chrY", 24); + for (int i = 191, j = 25; i <= 249; i++, j++) { + temp.put("GL000" + i + ".1", j); + } + HG37_CONTIGS_MAP = Collections.unmodifiableMap(temp); + } + @Override + public int compare(ChrPosition o1, ChrPosition 
o2) { + int chromosomeDiff = COMPARATOR.compare(o1.getChromosome(), o2.getChromosome()); + if (chromosomeDiff != 0) return chromosomeDiff; + + int positionDiff = o1.getStartPosition() - o2.getStartPosition(); + if (positionDiff != 0) return positionDiff; + + return o1.getEndPosition() - o2.getEndPosition(); + } + + /** + * Creates a ChrPosition comparator that is based on the chromosome name comparator supplied as an argument. + * Allows the user to be flexible as to how ChrPosition objects are compared + * + */ + public static Comparator getComparator(Comparator chrNameComp) { + + return Comparator.comparing(ChrPosition::getChromosome, chrNameComp).thenComparingInt(ChrPosition::getStartPosition).thenComparingInt(ChrPosition::getEndPosition); + } + + + /** + * This method is useful if you have a list of contigs whose order you want to preserve. + * eg. a sorted bam will in its header have a list of contigs, and it is possible that you would like to sort chromosome (Strings) based on this order + *

+ * If the list is empty of null, then @link ReferenceNameComparator comparator will be returned. + * + */ + public static Comparator getChrNameComparator(List list) { + + return (null == list || list.isEmpty()) ? COMPARATOR : (o1, o2) -> { + int i1 = list.indexOf(o1); + int i2 = list.indexOf(o2); + if (i1 >= 0 && i2 >= 0) { + return i1 - i2; + } else if (i1 >= 0) { + // o1.chr in list but not o2.chr => o1 < o2 + return -1; + } else if (i2 >= 0) { + // o2.chr in list but not o1.chr => o2 < o1 + return 1; + } else { + // neither o1 nor o2 chr in list => "natural" ordering + return o1.compareTo(o2); + } + }; + + } + + /** + * Returns a comparator for sorting chromosome names based on a map of chromosome names and their corresponding positions. + * The comparator sorts the chromosome names based on the positions in the map. If a chromosome name is not present in the map, + * it is treated as having a position of -1 and is sorted after the chromosome names present in the map. + * + * @param map a map of chromosome names and their corresponding positions + * @return a comparator for sorting chromosome names based on the positions in the map + */ + public static Comparator getChrNameComparator(Map map) { + + if (null == map || map.isEmpty()) { + return COMPARATOR; + } + + return (o1, o2) -> { + int i1 = map.getOrDefault(o1, -1); + int i2 = map.getOrDefault(o2, -1); + + if (i1 == -1){ + return (i2 == -1) ? o1.compareTo(o2) : 1; + } + if (i2 == -1){ + return -1; + } + return i1 - i2; + }; + + } + + /** + * Return a comparator for VCF records, preserving the order according to the supplied + * list of contigs. If the CHR value of record A is in the list but that of record B isn't + * then record A sorts earlier than the record B. 
If the CHR value of neither A nor B is in + * the list then the records are sorted according to the "natural" order given by + * `ChrPositionComparator.compare(o1, o2)` + */ + public static Comparator getVcfRecordComparator(List list) { + + return (null == list || list.isEmpty()) ? null : new Comparator<>() { + private final ChrPositionComparator chrPosComp = new ChrPositionComparator(); + + @Override + public int compare(VcfRecord o1, VcfRecord o2) { + ChrPosition o1Pos = o1.getChrPosition(); + ChrPosition o2Pos = o2.getChrPosition(); + int i1 = list.indexOf(o1Pos.getChromosome()); + int i2 = list.indexOf(o2Pos.getChromosome()); + if (i1 >= 0 && i2 >= 0) { + // o1 & o2 chr in list => order by chr in list then pos + int diff = i1 - i2; + if (diff == 0) { + diff = o1Pos.getStartPosition() - o2Pos.getStartPosition(); + } + return diff; + } else if (i1 >= 0) { + // o1.chr in list but not o2.chr => o1 < o2 + return -1; + } else if (i2 >= 0) { + // o2.chr in list but not o1.chr => o2 < o1 + return 1; + } else { + // neither o1 nor o2 chr in list => "natural" ordering + return chrPosComp.compare(o1Pos, o2Pos); + } + + } + }; + } + + /** + * Convenience method to return a VCFRecord comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB + * + */ + public static Comparator getVcfRecordComparatorForGRCh37() { + return getVcfRecordComparator(contigs); + } + + /** + * Convenience method to return a ChrPosition comparator based on the GRCh37_ICGC_standard_v2.fa reference file used at QIMRB + * + */ + public static Comparator getCPComparatorForGRCh37() { + return getComparator(getChrNameComparator(HG37_CONTIGS_MAP)); + } + + /** + * Convenience method to return a ChrPosition comparator based on the GRCh38 reference file used at QIMRB + * + */ + public static Comparator getStringComparatorForHG38() { + return getChrNameComparator(HG38_CONTIGS_MAP); + } + + /** + * Returns a comparator for sorting chromosome names, from "1" to "M" (inclusive). 
+ * + * The comparator compares chromosome names based on the following rules: + * - If both names are numeric (e.g., "2", "10"), they are sorted numerically. + * - If one name is numeric and the other is not, the numeric name is sorted first. + * - If both names are non-numeric, they are sorted lexicographically. + * + * @return the chromosome name comparator + */ + public static Comparator getChrNameComparatorNoChrsOneToM() { + + return (o1, o2) -> { + + int i1 = Character.isDigit(o1.charAt(0)) ? Integer.parseInt(o1) : -1; + int i2 = Character.isDigit(o2.charAt(0)) ? Integer.parseInt(o2) : -1; + if (i1 > -1 && i2 > -1) { + return i1 - i2; + } + + if (i1 == -1){ + i1 = o1.equals("X") ? 23 : o1.equals("Y") ? 24 : o1.equals("M") ? 25 : -1; + } + if (i2 == -1){ + i2 = o2.equals("X") ? 23 : o2.equals("Y") ? 24 : o2.equals("M") ? 25 : -1; + } + + if (i1 > -1) { + if (i2 > -1) { + return i1 - i2; + } else { + return -1; + } + } else if (i2 > -1) { + return 1; + } + return o1.compareTo(o2); + }; + + } } diff --git a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java index e6f6a9cc0..8bd252bda 100644 --- a/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java +++ b/qcommon/src/org/qcmg/common/util/ChrPositionUtils.java @@ -30,6 +30,60 @@ public static boolean doChrPositionsOverlap(ChrPosition a, ChrPosition b) { return doChrPositionsOverlap(a, b, 0); } + /** + * convert ChrPosition to a long. 
Will only examine the contig and start position + * + */ + public static long convertContigAndPositionToLong(String contig, int position) { + long l = (long) convertContigNameToInt(contig) << 32; + return l + position; + } + + public static ChrPosition convertLongToChrPosition(long l) { + int contig = (int) (l >> 32); + int position = (int) l; + + if (contig == 23) { + return ChrPointPosition.valueOf("X", position); + } else if (contig == 24) { + return ChrPointPosition.valueOf("Y", position); + } else if (contig == 25) { + return ChrPointPosition.valueOf("M", position); + } else if (contig > 25) { + return ChrPointPosition.valueOf("-1", position); + } + return ChrPointPosition.valueOf(Integer.toString(contig), position); + } + + /** + * Converts a contig name to an integer representing the contig. + * Assumes that the contig name is either a number or a string that does not start with "chr". + * + * @param contigName the name of the contig + * @return an integer representation of the contig + */ + public static int convertContigNameToInt(String contigName) { + if (null == contigName || contigName.isEmpty()) { + throw new IllegalArgumentException("null or empty contig name supplied to convertContigNameToInt"); + } + int i = Character.isDigit(contigName.charAt(0)) ? Integer.parseInt(contigName) : -1; + if (i > -1) { + return i; + } + + if (contigName.length() > 3 && contigName.startsWith("chr")) { + return convertContigNameToInt(contigName.substring(3)); + } + + return switch (contigName) { + case "X" -> 23; + case "Y" -> 24; + case "M" -> 25; + case "MT" -> 25; + default -> contigName.hashCode(); + }; + } + /** * Checks if two ChrPosition objects overlap with a buffer. 
* @@ -103,8 +157,8 @@ public static ChrPosition createCPFromCosmic(String cosmicCoords) { } else { int colonIndex = cosmicCoords.indexOf(':'); int minusIndex = cosmicCoords.indexOf('-'); - int start = Integer.parseInt(cosmicCoords.substring(colonIndex + 1, minusIndex)); - int end = Integer.parseInt(cosmicCoords.substring(minusIndex + 1)); + int start = Integer.parseInt(cosmicCoords, colonIndex + 1, minusIndex, 10); + int end = Integer.parseInt(cosmicCoords, minusIndex + 1, cosmicCoords.length(), 10); return getChrPosition("chr" + cosmicCoords.substring(0, colonIndex), start, end); } } @@ -167,7 +221,6 @@ public static ChrPosition cloneWithNewChromosomeName(ChrPosition cp, String newC /** * Converts a string in the format "chr1:12345-12345" to a ChrRangePosition object. - * The string must represent a range on the chromosome (start position does not equal end position). * * @param position the string to convert * @return the corresponding ChrRangePosition object @@ -185,8 +238,8 @@ public static ChrRangePosition getChrPositionFromString(String position) { } String chr = position.substring(0, colonPos); - int start = Integer.parseInt(position.substring(colonPos + 1, minusPos)); - int end = Integer.parseInt(position.substring(minusPos + 1)); + int start = Integer.parseInt(position, colonPos + 1, minusPos, 10); + int end = Integer.parseInt(position, minusPos + 1, position.length(), 10); return new ChrRangePosition(chr, start, end); } @@ -212,41 +265,12 @@ public static ChrPositionName getChrPositionNameFromString(String position, Stri } String chr = position.substring(0, colonPos); - int start = Integer.parseInt(position.substring(colonPos + 1, minusPos)); - int end = Integer.parseInt(position.substring(minusPos + 1)); + int start = Integer.parseInt(position, colonPos + 1, minusPos, 10); + int end = Integer.parseInt(position, minusPos + 1, position.length(), 10); return new ChrPositionName(chr, start, end, name); } - /** - * Converts a string in the format 
"chr1:12345-12345" to a ChrPointPosition object. - * The string must represent a single point on the chromosome (start position equals end position). - * - * @param position the string to convert - * @return the corresponding ChrPointPosition object - * @throws IllegalArgumentException if the string is null, empty, not in the correct format, or represents a range rather than a single point - */ - public static ChrPointPosition getChrPointPositionFromString(String position) { - if (StringUtils.isNullOrEmpty(position)) - throw new IllegalArgumentException("Null or empty string passed to getChrPositionFromString()"); - - int colonPos = position.indexOf(':'); - int minusPos = position.indexOf('-'); - - if (colonPos == -1 || minusPos == -1) { - throw new IllegalArgumentException("invalid string passed to getChrPositionFromString() - must be in chr1:12345-23456 format: " + position); - } - - String chr = position.substring(0, colonPos); - int start = Integer.parseInt(position.substring(colonPos + 1, minusPos)); - int end = Integer.parseInt(position.substring(minusPos + 1)); - if (start != end) { - throw new IllegalArgumentException("Start and end position in getChrPointPositionFromString are not the same. Start: " + start + ", end: " + end + ", from string: " + position); - } - - return ChrPointPosition.valueOf(chr, start); - } - /** * Returns a new ChrPosition object that precedes the given ChrPosition. * The start and end positions of the new ChrPosition are each one less than the corresponding positions of the given ChrPosition. 
@@ -258,25 +282,6 @@ public static ChrPosition getPrecedingChrPosition(ChrPosition cp) { return new ChrRangePosition(cp.getChromosome(), cp.getStartPosition() - 1, cp.getEndPosition() - 1); } - /** - * Returns a map of ChrPointPosition objects based on the contents of the supplied String array - * - * @param positions - * @return - */ - public static Map getChrPointPositionsFromStrings(String[] positions) { - - if (null == positions || positions.length == 0) - throw new IllegalArgumentException("null or empty string array passed to getChrPositionsFromStrings"); - - Map chrPositions = new HashMap<>(); - for (String s : positions) { - ChrPosition cpp = getChrPointPositionFromString(s); - chrPositions.put(cpp, cpp); - } - return chrPositions; - } - /** * Converts a ChrPosition and additional data to a VCF string. * diff --git a/qcommon/src/org/qcmg/common/util/TabTokenizer.java b/qcommon/src/org/qcmg/common/util/TabTokenizer.java index 3ab20a7ef..34e659093 100644 --- a/qcommon/src/org/qcmg/common/util/TabTokenizer.java +++ b/qcommon/src/org/qcmg/common/util/TabTokenizer.java @@ -1,7 +1,7 @@ /** * © Copyright The University of Queensland 2010-2014. * © Copyright QIMR Berghofer Medical Research Institute 2014-2016. - * + *

* This code is released under the terms outlined in the included LICENSE file. */ package org.qcmg.common.util; @@ -12,177 +12,177 @@ import java.util.NoSuchElementException; public class TabTokenizer { - - private static final char DELIM = '\t'; - private static final String[] stringArrayType = new String[] {}; - - public static String[] tokenize(final String data) { - return tokenize(data, DELIM); - } - public static String[] tokenize(final String data, int requiredEntries) { - return tokenize(data, DELIM, requiredEntries); - } - - public static String[] tokenize(final String data, final char delim) { - int nextIndex = data.indexOf(delim); - if (nextIndex < 0) { - /* - * rather than throw an IllegalArgumentExcpetion, return an array with the data as the only element - */ - return new String[]{data}; - } - - int currentIndex = 0; - final List resultList = new ArrayList<>(); - - resultList.add(data.substring(currentIndex, nextIndex)); - currentIndex = nextIndex + 1; - - nextIndex = data.indexOf(delim, currentIndex); - while (nextIndex != -1) { - resultList.add(data.substring(currentIndex, nextIndex)); - currentIndex = nextIndex + 1; - nextIndex = data.indexOf(delim, currentIndex); - } - // get last string - resultList.add(data.substring(currentIndex)); - - return resultList.toArray(stringArrayType); - } - - public static String[] tokenize(final String data, final char delim, final int requiredEntries) { - int noOfEntries = 0; - int nextIndex = data.indexOf(delim); - if (nextIndex < 0) { - /* - * rather than throw an IllegalArgumentExcpetion, return an array with the data as the only element - */ - return new String[]{data}; - } - - int currentIndex = 0; - final List resultList = new ArrayList<>(); - - resultList.add(data.substring(currentIndex, nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - - nextIndex = data.indexOf(delim, currentIndex); - while ((noOfEntries <= requiredEntries) && nextIndex != -1) { - resultList.add(data.substring(currentIndex, 
nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - nextIndex = data.indexOf(delim, currentIndex); - } - // get last string - resultList.add(data.substring(currentIndex)); - - return resultList.toArray(stringArrayType); - } - - public static String[] partialTokenize(final String data, final char delim, final int requiredEntries) { - int noOfEntries = 0; - int nextIndex = data.indexOf(delim); - if (nextIndex < 0) { - /* - * rather than throw an IllegalArgumentExcpetion, return an array with the data as the only element - */ - return new String[]{data}; - } - - int currentIndex = 0; - final List resultList = new ArrayList<>(requiredEntries + 1); - - resultList.add(data.substring(currentIndex, nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - - nextIndex = data.indexOf(delim, currentIndex); - while ((noOfEntries < requiredEntries) && nextIndex != -1) { - resultList.add(data.substring(currentIndex, nextIndex)); - noOfEntries++; - currentIndex = nextIndex + 1; - nextIndex = data.indexOf(delim, currentIndex); - } - - return resultList.toArray(stringArrayType); - } - - public static String[] tokenizeCharAt(final String data) { - return tokenizeCharAt(data, DELIM); - } - - public static String[] tokenizeCharAt(final String data, final char delim) { - final List resultList = new ArrayList(); - - int i=0; - int length = data.length(); - while (i<=length) { - int start = i; - while (i, Iterator { - private final String data; - private final char delim; - private int nextIndex; - private int currentIndex = 0; - private String next; - private boolean lastRecord = false; - - public Iter(String data, char delim) { - this.data = data; - this.delim = delim; - readNext(); - } - - @Override - public Iterator iterator() { - return this; - } - - @Override - public boolean hasNext() { - return null != next; - } - - @Override - public String next() { - if ( ! 
hasNext()) - throw new NoSuchElementException(); - - String s = next; - readNext(); - return s; - } - - private void readNext() { - if (lastRecord) { - next = null; - return; - } - nextIndex = data.indexOf(delim, currentIndex); - if (nextIndex < 0) { - lastRecord = true; - next = data.substring(currentIndex); - } else { - next = data.substring(currentIndex, nextIndex); - currentIndex = nextIndex + 1; - } - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - } + + private static final char DELIM = '\t'; + private static final String[] stringArrayType = new String[]{}; + + public static String[] tokenize(final String data) { + return tokenize(data, DELIM); + } + + public static String[] tokenize(final String data, int requiredEntries) { + return tokenize(data, DELIM, requiredEntries); + } + + public static String[] tokenize(final String data, final char delim) { + int nextIndex = data.indexOf(delim); + if (nextIndex < 0) { + /* + * rather than throw an IllegalArgumentException, return an array with the data as the only element + */ + return new String[]{data}; + } + + int currentIndex = 0; + final List resultList = new ArrayList<>(); + + resultList.add(data.substring(currentIndex, nextIndex)); + currentIndex = nextIndex + 1; + + nextIndex = data.indexOf(delim, currentIndex); + while (nextIndex != -1) { + resultList.add(data.substring(currentIndex, nextIndex)); + currentIndex = nextIndex + 1; + nextIndex = data.indexOf(delim, currentIndex); + } + // get last string + resultList.add(data.substring(currentIndex)); + + return resultList.toArray(stringArrayType); + } + + public static String[] tokenize(final String data, final char delim, final int requiredEntries) { + int noOfEntries = 0; + int nextIndex = data.indexOf(delim); + if (nextIndex < 0) { + /* + * rather than throw an IllegalArgumentException, return an array with the data as the only element + */ + return new String[]{data}; + } + + int currentIndex = 0; + final List 
resultList = new ArrayList<>(); + + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + + nextIndex = data.indexOf(delim, currentIndex); + while ((noOfEntries <= requiredEntries) && nextIndex != -1) { + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + nextIndex = data.indexOf(delim, currentIndex); + } + // get last string + resultList.add(data.substring(currentIndex)); + + return resultList.toArray(stringArrayType); + } + + public static String[] partialTokenize(final String data, final char delim, final int requiredEntries) { + int noOfEntries = 0; + int nextIndex = data.indexOf(delim); + if (nextIndex < 0) { + /* + * rather than throw an IllegalArgumentException, return an array with the data as the only element + */ + return new String[]{data}; + } + + int currentIndex = 0; + final List resultList = new ArrayList<>(requiredEntries + 1); + + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + + nextIndex = data.indexOf(delim, currentIndex); + while ((noOfEntries < requiredEntries) && nextIndex != -1) { + resultList.add(data.substring(currentIndex, nextIndex)); + noOfEntries++; + currentIndex = nextIndex + 1; + nextIndex = data.indexOf(delim, currentIndex); + } + + return resultList.toArray(stringArrayType); + } + + public static String[] tokenizeCharAt(final String data) { + return tokenizeCharAt(data, DELIM); + } + + public static String[] tokenizeCharAt(final String data, final char delim) { + final List resultList = new ArrayList<>(); + + int i = 0; + int length = data.length(); + while (i <= length) { + int start = i; + while (i < length && data.charAt(i) != delim) { + i++; + } + resultList.add(data.substring(start, i)); + // do something with the string here + i++; + } + return resultList.toArray(stringArrayType); + } + + static class Iter implements Iterable, Iterator { + private final 
String data; + private final char delim; + private int nextIndex; + private int currentIndex = 0; + private String next; + private boolean lastRecord = false; + + public Iter(String data, char delim) { + this.data = data; + this.delim = delim; + readNext(); + } + + @Override + public Iterator iterator() { + return this; + } + + @Override + public boolean hasNext() { + return null != next; + } + + @Override + public String next() { + if (!hasNext()) throw new NoSuchElementException(); + + String s = next; + readNext(); + return s; + } + + private void readNext() { + if (lastRecord) { + next = null; + return; + } + nextIndex = data.indexOf(delim, currentIndex); + if (nextIndex < 0) { + lastRecord = true; + next = data.substring(currentIndex); + } else { + next = data.substring(currentIndex, nextIndex); + currentIndex = nextIndex + 1; + } + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + } } diff --git a/qcommon/src/org/qcmg/common/vcf/VcfRecord.java b/qcommon/src/org/qcmg/common/vcf/VcfRecord.java index 2eef99854..694e61ee4 100644 --- a/qcommon/src/org/qcmg/common/vcf/VcfRecord.java +++ b/qcommon/src/org/qcmg/common/vcf/VcfRecord.java @@ -31,7 +31,7 @@ public class VcfRecord implements Comparable { static final QLogger logger = QLoggerFactory.getLogger(VcfRecord.class); - static final Comparator CHR_POS_COMPARATOR = ChrPositionComparator.getComparator(ChrPositionComparator.getChrNameComparator(null)); + static final Comparator CHR_POS_COMPARATOR = ChrPositionComparator.getComparator(ChrPositionComparator.getChrNameComparator((List)null)); private final ChrPosition cpp; private final String ref; @@ -266,7 +266,7 @@ public void setFormatFields(List field) { */ public VcfFormatFieldRecord getSampleFormatRecord(int index){ String s = (index >= formatRecords.size() || index == 0)? null: formatRecords.get(index); - return (s == null)? null : new VcfFormatFieldRecord(formatRecords.get(0), s); + return (s == null)? 
null : new VcfFormatFieldRecord(formatRecords.getFirst(), s); } /** @@ -276,7 +276,7 @@ public VcfFormatFieldRecord getSampleFormatRecord(int index){ */ public List getFormatFields() { // return a copy of this - if( formatRecords.size() == 0 ) return Collections.emptyList(); + if(formatRecords.isEmpty()) return Collections.emptyList(); return new ArrayList<>(formatRecords); } @@ -410,16 +410,15 @@ public int compareTo(VcfRecord arg0) { if (null != ref && null != arg0.ref) { diff = ref.compareTo(arg0.ref); - } else if (null != ref && null == arg0.ref) { + } else if (null != ref) { diff = -1; - } else if (null == ref && null != arg0.ref) { + } else if (null != arg0.ref) { diff = 1; } else { /* * both null */ - diff = 0; - } + } if (diff != 0) { return diff; @@ -430,16 +429,15 @@ public int compareTo(VcfRecord arg0) { */ if (null != alt && null != arg0.alt) { diff = alt.compareTo(arg0.alt); - } else if (null != alt && null == arg0.alt) { + } else if (null != alt) { diff = -1; - } else if (null == alt && null != arg0.alt) { + } else if (null != arg0.alt) { diff = 1; } else { /* * both null */ - diff = 0; - } + } if (diff != 0) { return diff; } diff --git a/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java b/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java index 147b6ce6a..91549a9a8 100644 --- a/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java +++ b/qcommon/test/org/qcmg/common/model/ChrPositionComparatorTest.java @@ -2,9 +2,7 @@ import static org.junit.Assert.*; -import java.util.Arrays; -import java.util.Comparator; -import java.util.List; +import java.util.*; import org.junit.Test; import org.qcmg.common.vcf.VcfRecord; @@ -38,7 +36,7 @@ public void vcfComp() { @Test public void vcfComp2() { - List contigs = Arrays.asList("chr1"); + List contigs = List.of("chr1"); Comparator c = ChrPositionComparator.getVcfRecordComparator(contigs); VcfRecord v1 = VcfUtils.createVcfRecord("chr1", 100); @@ -124,7 +122,11 @@ public void 
cpSortingReferenceAgnostic2() { @Test public void qsigComparatorTesting() { List contigOrder = Arrays.asList("chr1", "chr2", "chr3", "chr4", "chr5", "chr6", "chr7", "chr8", "chr9", "chr10", "chr11", "chr12", "chr13", "chr14", "chr15", "chr16", "chr17", "chr18", "chrX", "chrY", "GL000199.1", "GL000216.1", "chrMT"); - Comparator cpc = ChrPositionComparator.getChrNameComparator(contigOrder); + Map contigOrderMap = new HashMap<>(); + for (int i = 0; i < contigOrder.size(); i++) { + contigOrderMap.put(contigOrder.get(i), i); + } + Comparator cpc = ChrPositionComparator.getChrNameComparator(contigOrderMap); assertEquals(-1, cpc.compare("chr1", "chr2")); assertEquals(1, cpc.compare("chr2", "chr1")); @@ -135,5 +137,19 @@ public void qsigComparatorTesting() { assertEquals(1, cpc.compare("chrGL000216.1", "chrMT")); assertEquals(-22, cpc.compare("chr1", "chrMT")); } + + @Test + public void testShortcutComparator() { + Comparator cpc = ChrPositionComparator.getChrNameComparatorNoChrsOneToM(); + + assertEquals(-1, cpc.compare("1", "2")); + assertEquals(1, cpc.compare("2", "1")); + assertEquals(0, cpc.compare("2", "2")); + assertEquals(-1, cpc.compare("M", "GL000199.1")); + assertEquals(1, cpc.compare("GL000216.1", "M")); + assertEquals(-24, cpc.compare("1", "M")); + assertEquals(24, cpc.compare("M", "1")); + assertEquals(0, cpc.compare("M", "M")); + } } diff --git a/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java b/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java index b0ed74539..4267a56a4 100644 --- a/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java +++ b/qcommon/test/org/qcmg/common/util/ChrPositionUtilsTest.java @@ -36,6 +36,17 @@ public void testDelta() { assertEquals(true, ChrPositionUtils.arePositionsWithinDelta(cp1, cp2, 4)); } + @Test + public void testConvertChrPositionToLong() { + long expected = ((long) 4 << 32) + 9; + long actual = ChrPositionUtils.convertContigAndPositionToLong("4", 9); + assertEquals(expected, actual); + + 
ChrPosition cp = ChrPositionUtils.convertLongToChrPosition(actual); + assertEquals("4", cp.getChromosome()); + assertEquals(9, cp.getStartPosition()); + + } @Test public void toVcfStringShouldReturnCorrectFormat() { ChrPosition cp = new ChrRangePosition("chr1", 1000, 2000); diff --git a/qio/src/org/qcmg/qio/record/RecordReader.java b/qio/src/org/qcmg/qio/record/RecordReader.java index 138db25d7..8a63c4011 100644 --- a/qio/src/org/qcmg/qio/record/RecordReader.java +++ b/qio/src/org/qcmg/qio/record/RecordReader.java @@ -1,7 +1,7 @@ /** * © Copyright The University of Queensland 2010-2014. * © Copyright QIMR Berghofer Medical Research Institute 2014-2016. - * + *

* This code is released under the terms outlined in the included LICENSE file. */ @@ -22,6 +22,7 @@ import java.util.Iterator; import java.util.List; import java.util.zip.GZIPInputStream; + import org.qcmg.common.util.FileUtils; @@ -29,117 +30,117 @@ public abstract class RecordReader implements Closeable, Iterable { public static final int DEFAULT_BUFFER_SIZE = 65536; public static final String DEFAULT_HEADER_PREFIX = null; //no header line public static final Charset DEFAULT_CHARSET = StandardCharsets.UTF_8; - + protected final File file; //only allow create once protected final BufferedReader bin; - protected T next; - + protected T next; + protected List headerLines = new ArrayList<>(); - - public RecordReader(final File file) throws IOException { - this(file, DEFAULT_BUFFER_SIZE); + + public RecordReader(final File file) throws IOException { + this(file, DEFAULT_BUFFER_SIZE); } - + public RecordReader(final File file, int bufferSize) throws IOException { - this(file, bufferSize, DEFAULT_HEADER_PREFIX, DEFAULT_CHARSET); - } - - public RecordReader(final File file, CharSequence headerPrefix) throws IOException { - this(file, DEFAULT_BUFFER_SIZE, headerPrefix, DEFAULT_CHARSET); + this(file, bufferSize, DEFAULT_HEADER_PREFIX, DEFAULT_CHARSET); + } + + public RecordReader(final File file, CharSequence headerPrefix) throws IOException { + this(file, DEFAULT_BUFFER_SIZE, headerPrefix, DEFAULT_CHARSET); } - + public RecordReader(final File file, int bufferSize, CharSequence headerPrefix, Charset charset) throws IOException { this.file = file; - boolean isGzip = FileUtils.isInputGZip( file); - InputStream inputStream = (isGzip) ? new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); + boolean isGzip = FileUtils.isInputGZip(file); + InputStream inputStream = (isGzip) ? 
new GZIPInputStream(new FileInputStream(file)) : new FileInputStream(file); InputStreamReader streamReader = new InputStreamReader(inputStream, charset); bin = new BufferedReader(streamReader, bufferSize); - - String nextLine = readHeaderAndReturnFirstNonHeaderLine(headerPrefix); - //get first record, set to null for empty file - next = nextLine == null ? null : getRecord(nextLine); - + + String nextLine = readHeaderAndReturnFirstNonHeaderLine(headerPrefix); + //get first record, set to null for empty file + next = nextLine == null ? null : getRecord(nextLine); + } + /** * this method is overridable in subclass, eg illumina file have different header pattern - * + * * @param headerPrefix * @return the first line just after header * @throws IOException */ - public String readHeaderAndReturnFirstNonHeaderLine(CharSequence headerPrefix ) throws IOException { - - String nextLine = bin.readLine(); - - //keep empty header and return first nonHeaderline - if (headerPrefix == null) return nextLine; - - //reader header, hence file pointer to first line after header - while ( nextLine != null && nextLine.startsWith(headerPrefix + "") ) { - headerLines.add(nextLine); - //reset current read line - nextLine = bin.readLine(); - } - - return nextLine; + public String readHeaderAndReturnFirstNonHeaderLine(CharSequence headerPrefix) throws IOException { + + String nextLine = bin.readLine(); + + //keep empty header and return first nonHeaderline + if (headerPrefix == null) return nextLine; + + //reader header, hence file pointer to first line after header + while (nextLine != null && nextLine.startsWith(headerPrefix + "")) { + headerLines.add(nextLine); + //reset current read line + nextLine = bin.readLine(); + } + + return nextLine; } - - /** - * This reader can maximum take Integer.max lines of file header. Please make other header if bigger than this. 
- * @return a list of header lines - */ - public List getHeader() { - return headerLines; + + /** + * This reader can maximum take Integer.MAX lines of file header. Please make other header if bigger than this. + * + * @return a list of header lines + */ + public List getHeader() { + return headerLines; } @Override /** * Here, BufferedReader.close() calls InputStreamReader.close(), which API told us that it Closes the stream and releases any system resources associated with it. - */ - public void close() throws IOException { - bin.close(); + */ public void close() throws IOException { + bin.close(); } public File getFile() { - return file; + return file; } - @Override - public Iterator iterator() { - Iterator iter = new Iterator() { + @Override + public Iterator iterator() { + + return new Iterator<>() { @Override - public boolean hasNext() { - return null != next; + public boolean hasNext() { + return null != next; } - - @Override - //return the stored record (next), even it is null - public T next() { - T rec = next; - next = null; //in case exception happen, same line repeatedly - - try { - //get next record, it may read multi lines - String line = bin.readLine(); - if ( line != null ) { - next = getRecord( line ); - } - - return rec; - } catch (IOException e) { - //here we only catch IO exception - throw new UncheckedIOException(e); - } + + @Override + //return the stored record (next), even it is null + public T next() { + T rec = next; + next = null; //in case exception happen, same line repeatedly + + try { + //get next record, it may read multi lines + String line = bin.readLine(); + if (line != null) { + next = getRecord(line); + } + + return rec; + } catch (IOException e) { + //here we only catch IO exception + throw new UncheckedIOException(e); + } } }; - - return iter; - } - - //some record cross multi lines, eg id\nseq\n, this method may call bin.readLine() inside - public abstract T getRecord(String line); + } + + //some record cross multi lines, eg 
id\nseq\n, this method may call bin.readLine() inside + public abstract T getRecord(String line); } diff --git a/qio/src/org/qcmg/qio/record/StringFileReader.java b/qio/src/org/qcmg/qio/record/StringFileReader.java index 3ffc467d4..1613cac10 100644 --- a/qio/src/org/qcmg/qio/record/StringFileReader.java +++ b/qio/src/org/qcmg/qio/record/StringFileReader.java @@ -29,10 +29,7 @@ public StringFileReader(final File file, int bufferSize, CharSequence headerPref @Override - /** - * return input self even it is null - */ - public String getRecord(String line) { + public String getRecord(String line) { return line; } } \ No newline at end of file diff --git a/qsignature/src/org/qcmg/sig/Generate.java b/qsignature/src/org/qcmg/sig/Generate.java index a6d44c65c..1c12fefb6 100644 --- a/qsignature/src/org/qcmg/sig/Generate.java +++ b/qsignature/src/org/qcmg/sig/Generate.java @@ -200,7 +200,11 @@ private void processBamFiles() throws IOException { * Set chrComparator and * order snps based on bam contig order */ - chrComparator = ChrPositionComparator.getChrNameComparator(bamContigs); + Map contigOrderMap = new LinkedHashMap<>(); + for (int i = 0; i < bamContigs.size(); i++) { + contigOrderMap.put(bamContigs.get(i), i); + } + chrComparator = ChrPositionComparator.getChrNameComparator(contigOrderMap); positionsIterator.sort(bamContigs); /* @@ -251,10 +255,6 @@ private void processIlluminaFiles() throws IOException { logger.info("got following details from illumina file:" + illuminaFile.getName()); logger.info("patient: " + patient + ", sample: " + sample + ", inputType: " + inputType); - if (null != inputType && inputType.length() == 4) { - inputType = inputType.substring(1, 3); - } - /* * load data from snp chip file into map */