diff --git a/phenomiser-cli/src/main/java/org/jax/cmd/PhenopacketCommand.java b/phenomiser-cli/src/main/java/org/jax/cmd/PhenopacketCommand.java
new file mode 100644
index 0000000..03d93ef
--- /dev/null
+++ b/phenomiser-cli/src/main/java/org/jax/cmd/PhenopacketCommand.java
@@ -0,0 +1,147 @@
+package org.jax.cmd;
+
+import com.beust.jcommander.Parameter;
+import org.jax.Phenomiser;
+import org.jax.io.DiseaseParser;
+import org.jax.io.HpoParser;
+import org.jax.io.PhenopacketImporter;
+import org.jax.model.Item2PValueAndSimilarity;
+import org.jax.services.AbstractResources;
+import org.jax.services.CachedResources;
+import org.jax.utils.DiseaseDB;
+import org.json.simple.parser.ParseException;
+import org.monarchinitiative.phenol.base.PhenolException;
+import org.monarchinitiative.phenol.io.obo.hpo.HpoDiseaseAnnotationParser;
+import org.monarchinitiative.phenol.ontology.data.TermId;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import javax.annotation.Nullable;
+import java.io.*;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.stream.Collectors;
+
+public class PhenopacketCommand extends PhenomiserCommand {
+
+    private static final Logger logger = LoggerFactory.getLogger(PhenopacketCommand.class);
+    final String HOME = System.getProperty("user.home");
+
+    @Parameter(names = {"-hpo", "--hpo_path"}, description = "specify the path to hp.obo")
+    private String hpoPath;
+    @Parameter(names = {"-da", "--disease_annotation"}, description = "specify the path to the disease annotation file")
+    private String diseasePath;
+    @Parameter(names = {"-cachePath", "--cachePath"}, description = "specify the path to save precomputed data")
+    private String cachePath = HOME + File.separator + "Phenomiser_data";
+    @Parameter(names = {"-db", "--diseaseDB"},
+            description = "choose disease database [OMIM,ORPHA]")
+    private String diseaseDB = "OMIM";
+    @Parameter(names = {"-pp", "--phenopacket"}, description = "specify the path to a phenopacket file")
+    private String phenopacket;
+
+    @Parameter(names = {"-o", "--output"}, description = "specify the output path")
+    private String outPath;
+
+    private AbstractResources resources;
+
+    @Override
+    public void run() {
+        HpoParser hpoParser = new HpoParser(hpoPath);
+        hpoParser.init();
+        HpoDiseaseAnnotationParser diseaseAnnotationParser = new HpoDiseaseAnnotationParser(diseasePath, hpoParser.getHpo());
+        DiseaseParser diseaseParser = new DiseaseParser(diseaseAnnotationParser, hpoParser.getHpo());
+        try {
+            diseaseParser.init();
+        } catch (PhenolException e) {
+            e.printStackTrace();
+            System.exit(1);
+        }
+
+        if (!Files.exists(Paths.get(cachePath))) {
+            System.err.println("Cannot find cached data at " + cachePath);
+            System.exit(1);
+        }
+
+        List<TermId> queryList;
+        try {
+            PhenopacketImporter ppimporter = PhenopacketImporter.fromJson(phenopacket);
+            queryList = ppimporter.getHpoTerms();
+        } catch (ParseException | IOException e) {
+            e.printStackTrace();
+            return;
+        }
+
+        resources = new CachedResources(hpoParser, diseaseParser, cachePath, Math.min(queryList.size(), 10));
+        resources.init();
+        Phenomiser.setResources(resources);
+
+        List<DiseaseDB> db = Arrays.stream(diseaseDB.split(",")).map(DiseaseDB::valueOf).collect(Collectors.toList());
+        List<Item2PValueAndSimilarity<TermId>> result = Phenomiser.query(queryList, db);
+
+        //output query result
+        if (!result.isEmpty()) {
+            write_query_result(result, outPath);
+        }
+    }
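+
+    /*
+     * Invocation sketch. The jar and subcommand names below are illustrative
+     * (the JCommander main-class wiring is outside this diff); the flags match
+     * the @Parameter definitions above.
+     *
+     *   java -jar Phenomiser.jar phenopacket -hpo hp.obo -da phenotype.hpoa \
+     *       -pp patient.json -db OMIM -o ranking.tsv
+     */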
+
+    public static Writer getWriter(String path) {
+        Writer writer;
+        try {
+            writer = new FileWriter(new File(path));
+        } catch (Exception e) {
+            logger.info("out path not found. writing to console: ");
+            writer = new OutputStreamWriter(System.out);
+        }
+        return writer;
+    }
+
+    public void write_query_result(List<Item2PValueAndSimilarity<TermId>> result, @Nullable String outPath) {
+
+        Writer writer = getWriter(outPath);
+
+        try {
+            writer.write("diseaseId\tdiseaseName\tp\tadjust_p"
+                    + "\tsimilarityScore"
+                    + "\n");
+        } catch (IOException e) {
+            logger.error("io exception during writing header. writing output aborted.");
+            return;
+        }
+        List<Item2PValueAndSimilarity<TermId>> newList = new ArrayList<>(result);
+        Collections.sort(newList);
+
+        newList.forEach(e -> {
+            try {
+                writer.write(e.getItem().getValue());
+                writer.write("\t");
+                writer.write(resources.getDiseaseMap().get(e.getItem()).getName());
+                writer.write("\t");
+                writer.write(Double.toString(e.getRawPValue()));
+                writer.write("\t");
+                writer.write(Double.toString(e.getAdjustedPValue()));
+                writer.write("\t");
+                writer.write(Double.toString(e.getSimilarityScore()));
+                writer.write("\n");
+            } catch (IOException exception) {
+                logger.error("IO exception during writing out adjusted p values");
+            }
+        });
+
+        try {
+            writer.close();
+        } catch (IOException e) {
+            logger.error("IO exception during closing writer");
+        }
+    }
+}
diff --git a/phenomiser-cli/src/main/java/org/jax/cmd/QueryCommand.java b/phenomiser-cli/src/main/java/org/jax/cmd/QueryCommand.java
index f89e23e..1161654 100644
--- a/phenomiser-cli/src/main/java/org/jax/cmd/QueryCommand.java
+++ b/phenomiser-cli/src/main/java/org/jax/cmd/QueryCommand.java
@@ -12,7 +12,6 @@ import org.monarchinitiative.phenol.base.PhenolException;
 import org.monarchinitiative.phenol.io.obo.hpo.HpoDiseaseAnnotationParser;
 import org.monarchinitiative.phenol.ontology.data.TermId;
-import org.monarchinitiative.phenol.stats.Item2PValue;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

@@ -95,10 +94,6 @@ public static Writer getWriter(String path) {
     public void write_query_result(List<Item2PValueAndSimilarity<TermId>> result, @Nullable String outPath) {

-//        if (adjusted_p_value == null) {
-//            return;
-//        }
-
         Writer writer = getWriter(outPath);

         try {
diff --git a/phenomiser-core/src/main/java/org/jax/Phenomiser.java b/phenomiser-core/src/main/java/org/jax/Phenomiser.java
index e4a1732..20da0d1 100644
--- a/phenomiser-core/src/main/java/org/jax/Phenomiser.java
+++ b/phenomiser-core/src/main/java/org/jax/Phenomiser.java
@@ -1,13 +1,12 @@
 package org.jax;

-import org.h2.mvstore.DataUtils;
 import org.jax.model.Item2PValueAndSimilarity;
 import org.jax.services.AbstractResources;
+import org.jax.services.CachedResources;
 import org.jax.services.PValueCalculator;
 import org.jax.services.SimilarityScoreCalculator;
 import org.jax.utils.DiseaseDB;
 import org.jax.utils.Ranker;
-import org.monarchinitiative.phenol.ontology.data.Term;
 import org.monarchinitiative.phenol.ontology.data.TermId;
 import org.monarchinitiative.phenol.stats.BenjaminiHochberg;
 import org.monarchinitiative.phenol.stats.Item2PValue;
@@ -81,31 +80,8 @@ public static List<Item2PValueAndSimilarity<TermId>> query(List<TermId> queryTerms, List<DiseaseDB> dbs) {
         return adjusted;
     }

-    /**
-     * Query in batch mode with multiple queries. This method optimizes resource usage to avoid repeated file io.
-     * @param queries a list of query list.
-     * @param dbs a list of disease databases
-     * @return a list of disease ranking lists
-     */
-    public static List<List<Item2PValueAndSimilarity<TermId>>> batchQuery(List<List<TermId>> queries, List<DiseaseDB> dbs) {
-
-        Map<Integer, Integer> listSizes = new HashMap<>(); // from first to last list, count how many Terms each list has
-        for (int i = 0; i < queries.size(); i++) {
-            listSizes.put(i, queries.get(i).size());
-        }
-
-        //process query lists in the order of how many terms they have
-        listSizes.values().forEach(listSize -> {
-
-
-
-        });
-
-        throw new UnsupportedOperationException("TO implement");
-    }
-
     /**
      * Provide a list of query terms and a disease ID, find the rank of specified disease in the disease ranking
      * @param queryTerms
@@ -129,6 +105,36 @@ public static int findRank(List<TermId> queryTerms, TermId targetDisease, List<DiseaseDB> dbs) {
+    /**
+     * Query in batch mode with multiple queries. This method optimizes resource usage to avoid repeated file io.
+     * @param queries a list of query term lists
+     * @param dbs a list of disease databases
+     * @return a list of disease ranking lists
+     */
+    public static List<List<Item2PValueAndSimilarity<TermId>>> batchQuery(List<List<TermId>> queries, List<DiseaseDB> dbs) {
+
+        // collect the distinct term counts across the query lists
+        Set<Integer> termCounts = queries.stream().map(List::size).collect(Collectors.toSet());
+
+        // pre-fill with nulls so each result can be written back at its original query
+        // index; the queries are processed grouped by term count rather than in list
+        // order, so ArrayList.add(index, element) could be asked to insert past the end
+        List<List<Item2PValueAndSimilarity<TermId>>> queryResults = new ArrayList<>(queries.size());
+        for (int i = 0; i < queries.size(); i++) {
+            queryResults.add(null);
+        }
+
+        // process the query lists grouped by term count so that the cached score
+        // distribution for each query size is loaded only once
+        termCounts.forEach(termCount -> {
+            if (resources instanceof CachedResources) {
+                ((CachedResources) resources).cleanAndLoadScoreDistribution(termCount);
+            }
+
+            for (int i = 0; i < queries.size(); i++) {
+                if (queries.get(i).size() == termCount) {
+                    List<Item2PValueAndSimilarity<TermId>> queryResult = query(queries.get(i), dbs);
+                    queryResults.set(i, queryResult);
+                }
+            }
+        });
+
+        return queryResults;
+    }
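+
+    /*
+     * Usage sketch for batchQuery (assumed caller code, not part of this diff):
+     *
+     *   Phenomiser.setResources(resources);     // as in the CLI commands
+     *   List<List<TermId>> queries = ...;       // one HPO term list per case
+     *   List<DiseaseDB> dbs = Arrays.asList(DiseaseDB.OMIM);
+     *   List<List<Item2PValueAndSimilarity<TermId>>> rankings = Phenomiser.batchQuery(queries, dbs);
+     *   // rankings.get(i) is the disease ranking for queries.get(i)
+     */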

     /**
      * Provide multiple query term lists. For each query list, provide a target disease in a separate list. Return the rank of specified disease for each query list.
      * @param queries
@@ -138,7 +144,27 @@
      * @param targetDiseases
      * @param dbs
      * @return
      */
     public static int[] findRankBatch(List<List<TermId>> queries, List<TermId> targetDiseases, List<DiseaseDB> dbs){
-        throw new UnsupportedOperationException("TO implement");
+
+        // collect the distinct term counts across the query lists
+        Set<Integer> termCounts = queries.stream().map(List::size).collect(Collectors.toSet());
+
+        int[] ranks = new int[queries.size()];
+
+        // process the query lists grouped by term count so that the cached score
+        // distribution for each query size is loaded only once; each rank is written
+        // back at the original query index
+        termCounts.forEach(termCount -> {
+            if (resources instanceof CachedResources) {
+                ((CachedResources) resources).cleanAndLoadScoreDistribution(termCount);
+            }
+
+            for (int i = 0; i < queries.size(); i++) {
+                if (queries.get(i).size() == termCount) {
+                    int rank = findRank(queries.get(i), targetDiseases.get(i), dbs);
+                    ranks[i] = rank;
+                }
+            }
+        });
+
+        return ranks;
     }
 }
diff --git a/phenomiser-core/src/main/java/org/jax/io/PhenopacketImporter.java b/phenomiser-core/src/main/java/org/jax/io/PhenopacketImporter.java
new file mode 100644
index 0000000..85df235
--- /dev/null
+++ b/phenomiser-core/src/main/java/org/jax/io/PhenopacketImporter.java
@@ -0,0 +1,182 @@
+package org.jax.io;
+
+import com.google.common.collect.ImmutableList;
+import com.google.protobuf.util.JsonFormat;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
+import org.monarchinitiative.phenol.ontology.data.Ontology;
+import org.monarchinitiative.phenol.ontology.data.Term;
+import org.monarchinitiative.phenol.ontology.data.TermId;
+import org.phenopackets.schema.v1.Phenopacket;
+import org.phenopackets.schema.v1.core.HtsFile;
+import org.phenopackets.schema.v1.core.OntologyClass;
+import org.phenopackets.schema.v1.core.Phenotype;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.List;
+import java.util.function.Predicate;
+
+/**
+ * This class ingests a phenopacket, which may additionally contain the path of a VCF file
+ * to be used for the analysis.
+ * @author Peter Robinson
+ */
+public class PhenopacketImporter {
+    private static final Logger logger = LoggerFactory.getLogger(PhenopacketImporter.class);
+    /** The Phenopacket that represents the individual being sequenced in the current run. */
+    private final Phenopacket phenoPacket;
+    /** A list of non-negated HPO terms observed in the subject of this Phenopacket. */
+    private ImmutableList<TermId> hpoTerms;
+    /** A list of negated HPO terms observed in the subject of this Phenopacket. */
+    private ImmutableList<TermId> negatedHpoTerms;
+    /** Path to the VCF file with variants identified in the subject of this Phenopacket. */
+    private String vcfPath;
+    /** Genome assembly of the VCF file in {@link #vcfPath}. */
+    private String genomeAssembly;
+    /** Name of the proband of the Phenopacket (corresponds to the {@code id} element of the phenopacket). */
+    private final String samplename;
+
+    /**
+     * Factory method to obtain a PhenopacketImporter object starting from a phenopacket in JSON format
+     * @param pathToJsonPhenopacketFile -- path to the phenopacket
+     * @return {@link PhenopacketImporter} object corresponding to the phenopacket
+     * @throws ParseException if the JSON code cannot be parsed
+     * @throws IOException if the file cannot be found
+     */
+    public static PhenopacketImporter fromJson(String pathToJsonPhenopacketFile) throws ParseException, IOException {
+        JSONParser parser = new JSONParser();
+        logger.trace("Importing Phenopacket: " + pathToJsonPhenopacketFile);
+        Object obj = parser.parse(new FileReader(pathToJsonPhenopacketFile));
+        JSONObject jsonObject = (JSONObject) obj;
+        String phenopacketJsonString = jsonObject.toJSONString();
+        Phenopacket phenopacket;
+        try {
+            Phenopacket.Builder phenoPacketBuilder = Phenopacket.newBuilder();
+            JsonFormat.parser().merge(phenopacketJsonString, phenoPacketBuilder);
+            phenopacket = phenoPacketBuilder.build();
+        } catch (IOException e1) {
+            throw new RuntimeException("Could not load phenopacket at " + pathToJsonPhenopacketFile, e1);
+        }
+        return new PhenopacketImporter(phenopacket);
+    }
+
+    public PhenopacketImporter(Phenopacket ppack) {
+        this.phenoPacket = ppack;
+        this.samplename = this.phenoPacket.getSubject().getId();
+        extractProbandHpoTerms();
+        extractNegatedProbandHpoTerms();
+        extractVcfData();
+    }
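+
+    /*
+     * Usage sketch (assumed caller code; this mirrors what PhenopacketCommand.run() does):
+     *
+     *   PhenopacketImporter importer = PhenopacketImporter.fromJson("patient.json");
+     *   List<TermId> observed = importer.getHpoTerms();        // query terms for Phenomiser
+     *   List<TermId> excluded = importer.getNegatedHpoTerms(); // explicitly ruled-out phenotypes
+     */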
+
+    public boolean hasVcf() { return this.vcfPath != null; }
+
+    public List<TermId> getHpoTerms() {
+        return hpoTerms;
+    }
+
+    public List<TermId> getNegatedHpoTerms() {
+        return negatedHpoTerms;
+    }
+
+    public String getVcfPath() {
+        return vcfPath;
+    }
+
+    public String getGenomeAssembly() {
+        return genomeAssembly;
+    }
+
+    public String getSamplename() {
+        return samplename;
+    }
+
+    /**
+     * Check both the observed and the negated term lists for obsolete term ids.
+     * @return true if no obsolete term id is used
+     */
+    public boolean checkForObsoleteTerms(Ontology ontology) {
+        // check both lists before combining, so that every obsolete id gets logged
+        boolean observedClean = containsNoObsoleteTerms(hpoTerms, ontology);
+        boolean negatedClean = containsNoObsoleteTerms(negatedHpoTerms, ontology);
+        return observedClean && negatedClean;
+    }
+
+    /** Log an error for each obsolete term id in the given list; return true if the list is clean. */
+    private boolean containsNoObsoleteTerms(List<TermId> termIds, Ontology ontology) {
+        boolean clean = true;
+        for (TermId tid : termIds) {
+            if (ontology.getObsoleteTermIds().contains(tid)) {
+                clean = false;
+                logger.error("Use of obsolete term id: {}", tid);
+                Term term = ontology.getTermMap().get(tid);
+                if (term == null) {
+                    logger.error("Could not find TermObject.");
+                    continue;
+                }
+                logger.error("The corresponding term label is {}", term.getName());
+                logger.error("We recommend replacing the term id with the current id: {}", term.getId().getValue());
+            }
+        }
+        return clean;
+    }
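+
+    /*
+     * Usage sketch (assumed caller code; 'ontology' would come from parsing hp.obo):
+     *
+     *   if (!importer.checkForObsoleteTerms(ontology)) {
+     *       // the log lists each obsolete term id together with its current replacement
+     *   }
+     */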
+
+    /**
+     * This method extracts a list of all of the non-negated HPO terms that are annotated
+     * to the proband of this phenopacket. Note that we use "distinct" to keep only
+     * distinct elements, defensively, even though a valid phenopacket should not have duplicates.
+     */
+    private void extractProbandHpoTerms() {
+        this.hpoTerms = phenoPacket
+                .getPhenotypesList()
+                .stream()
+                .distinct()
+                .filter(((Predicate<Phenotype>) Phenotype::getNegated).negate()) // i.e., just take non-negated phenotypes
+                .map(Phenotype::getType)
+                .map(OntologyClass::getId)
+                .map(TermId::of)
+                .collect(ImmutableList.toImmutableList());
+    }
+
+    /**
+     * This method extracts a list of all negated HPO terms associated with the proband.
+     */
+    private void extractNegatedProbandHpoTerms() {
+        this.negatedHpoTerms = phenoPacket
+                .getPhenotypesList()
+                .stream()
+                .filter(Phenotype::getNegated) // i.e., just take negated phenotypes
+                .map(Phenotype::getType)
+                .map(OntologyClass::getId)
+                .map(TermId::of)
+                .collect(ImmutableList.toImmutableList());
+    }
+
+    /** This method extracts the VCF file path and the corresponding genome assembly. We assume that
+     * the phenopacket contains a single VCF file and that this file is for a single person. */
+    private void extractVcfData() {
+        List<HtsFile> htsFileList = phenoPacket.getHtsFilesList();
+        if (htsFileList.isEmpty()) {
+            return;
+        }
+        if (htsFileList.size() > 1) {
+            logger.warn("Multiple HtsFiles associated with this phenopacket");
+            logger.warn("We will use the path to the first VCF file we find");
+        }
+        for (HtsFile htsFile : htsFileList) {
+            if (htsFile.getHtsFormat().equals(HtsFile.HtsFormat.VCF)) {
+                this.vcfPath = htsFile.getFile().getPath();
+                this.genomeAssembly = htsFile.getGenomeAssembly();
+                return; // keep the first VCF file, as announced in the log message
+            }
+        }
+    }
+}
diff --git a/pom.xml b/pom.xml
index 5f8ff31..25e1ee9 100644
--- a/pom.xml
+++ b/pom.xml
@@ -22,6 +22,7 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <maven.compiler.source>1.8</maven.compiler.source>
         <maven.compiler.target>1.8</maven.compiler.target>
+        <phenopacket.version>0.4.0</phenopacket.version>
@@ -79,6 +80,19 @@
             <version>${phenol.version}</version>
         </dependency>

+        <dependency>
+            <groupId>org.phenopackets</groupId>
+            <artifactId>phenopacket-schema</artifactId>
+            <version>${phenopacket.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>com.googlecode.json-simple</groupId>
+            <artifactId>json-simple</artifactId>
+            <version>1.1.1</version>
+        </dependency>
+
         <dependency>
             <groupId>org.mockito</groupId>