Skip to content

Commit

Permalink
analysis: improve PLINK analysis (variant command line), #126
Browse files Browse the repository at this point in the history
  • Loading branch information
jtarraga committed Jun 13, 2017
1 parent dd07a79 commit d633e7c
Show file tree
Hide file tree
Showing 11 changed files with 705 additions and 18 deletions.
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
package org.opencb.hpg.bigdata.analysis;

import org.apache.spark.sql.SparkSession;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
* Created by jtarraga on 30/01/17.
*/
public abstract class AnalysisExecutor {

protected static Logger logger = LoggerFactory.getLogger(AnalysisExecutor.class);

public static String metadataExtension = ".meta.json";

protected String datasetName;
protected SparkSession sparkSession;

public abstract void execute() throws AnalysisExecutorException;
protected abstract void execute() throws AnalysisExecutorException;

public String getDatasetName() {
return datasetName;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
package org.opencb.hpg.bigdata.analysis.variant;

import org.apache.spark.sql.Row;
import org.opencb.hpg.bigdata.analysis.AnalysisExecutor;
import org.opencb.hpg.bigdata.core.lib.VariantDataset;

import java.nio.file.Path;
import java.util.List;
import java.util.Map;

/**
* Created by jtarraga on 30/05/17.
Expand All @@ -9,6 +15,41 @@ public abstract class VariantAnalysisExecutor extends AnalysisExecutor {

protected String studyName;

/**
 * Loads the variant dataset stored at the given path and registers it as the
 * temporary view "vcf".
 *
 * NOTE(review): no filtering is applied yet. The previous implementation assembled
 * a per-study SELECT (exploding "studies" and matching study.studyId against
 * datasetName) into a local buffer but never executed it, and filterOptions was
 * never read. That dead code has been removed; the returned dataset is unchanged.
 * TODO: when the study filter is implemented, bind datasetName instead of
 * concatenating it into the SQL text.
 *
 * @param path          Location of the variant dataset to load
 * @param filterOptions Filter options (currently not applied)
 * @return              Variant dataset loaded from path, registered as view "vcf"
 * @throws Exception    Declared for callers; presumably raised when loading fails
 */
protected VariantDataset filter(Path path, Map<String, String> filterOptions) throws Exception {
    VariantDataset vd = new VariantDataset(sparkSession);
    vd.load(path.toString());
    vd.createOrReplaceTempView("vcf");

    return vd;
}

/**
 * Loads the variant dataset stored at the given path, applies the given variant
 * filters and returns the resulting rows as a list.
 *
 * The SQL generated by the dataset is rewritten textually: the " * " projection
 * becomes "id, chromosome, start, study.samplesData" and the " vcf " source gets a
 * LATERAL VIEW that explodes "studies" as "study".
 * NOTE(review): String.replace substitutes every occurrence, so this assumes the
 * generated SQL contains " * " and " vcf " only in the SELECT/FROM positions.
 *
 * @param path          Location of the variant dataset to load
 * @param filterOptions Variant filters to apply before querying
 * @return              Rows (id, chromosome, start, study.samplesData) of the filtered variants
 * @throws Exception    Declared for callers; presumably raised when loading or filtering fails
 */
protected List<Row> getRows(Path path, VariantFilterOptions filterOptions) throws Exception {
    VariantDataset vd = new VariantDataset(sparkSession);

    vd.load(path.toString());
    vd.createOrReplaceTempView("vcf");
    VariantAnalysisUtils.addVariantFilters(filterOptions, vd);

    String sql = vd.getSql().replace(" * ", " id, chromosome, start, study.samplesData ")
            .replace(" vcf ", " vcf LATERAL VIEW explode(studies) act as study ");
    // Use the logger inherited from AnalysisExecutor instead of writing to stdout.
    logger.debug("SQL = {}", sql);
    return vd.sqlContext().sql(sql).collectAsList();
}

/**
 * Gets the study name held by this executor.
 *
 * @return Current value of the studyName field (may be null if never set)
 */
public String getStudyName() {
    return this.studyName;
}
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
package org.opencb.hpg.bigdata.analysis.variant;

import org.opencb.hpg.bigdata.core.lib.VariantDataset;

import java.io.IOException;
import java.util.List;

/**
* Created by jtarraga on 13/06/17.
*/
public class VariantAnalysisUtils {

    /**
     * Add variant filters to the target variant dataset.
     *
     * Each filter list of the given options is forwarded to the matching filter
     * method of the dataset; lists that are null or empty are skipped.
     *
     * @param filterOptions Filters to apply
     * @param vd            Target variant dataset
     * @throws IOException  Exception
     */
    public static void addVariantFilters(VariantFilterOptions filterOptions,
                                         VariantDataset vd) throws IOException {
        // ID list
        if (validList(filterOptions.getIdList())) {
            vd.idFilter(filterOptions.getIdList(), false);
        }

        // type
        if (validList(filterOptions.getTypeList())) {
            vd.typeFilter(filterOptions.getTypeList());
        }

        // query for biotype
        if (validList(filterOptions.getBiotypeList())) {
            vd.annotationFilter("biotype", filterOptions.getBiotypeList());
        }

        // query for study
        if (validList(filterOptions.getStudyList())) {
            vd.studyFilter("studyId", filterOptions.getStudyList());
        }

        // query for maf (study:cohort)
        if (validList(filterOptions.getMafList())) {
            vd.studyFilter("stats.maf", filterOptions.getMafList());
        }

        // query for mgf (study:cohort)
        if (validList(filterOptions.getMgfList())) {
            vd.studyFilter("stats.mgf", filterOptions.getMgfList());
        }

        // query for region
        if (validList(filterOptions.getRegionList())) {
            vd.regionFilter(filterOptions.getRegionList());
        }

        // query for consequence type (Sequence Ontology term names and accession codes)
        if (validList(filterOptions.getConsequenceTypeList())) {
            vd.annotationFilter("consequenceTypes.sequenceOntologyTerms", filterOptions.getConsequenceTypeList());
        }

        // query for consequence type (gene names)
        if (validList(filterOptions.getGeneList())) {
            vd.annotationFilter("consequenceTypes.geneName", filterOptions.getGeneList());
        }

        // query for clinvar (accession)
        if (validList(filterOptions.getClinvarList())) {
            vd.annotationFilter("variantTraitAssociation.clinvar.accession", filterOptions.getClinvarList());
        }

        // query for cosmic (mutation ID)
        if (validList(filterOptions.getCosmicList())) {
            vd.annotationFilter("variantTraitAssociation.cosmic.mutationId", filterOptions.getCosmicList());
        }

        // query for conservation (phastCons, phylop, gerp)
        if (validList(filterOptions.getConservScoreList())) {
            vd.annotationFilter("conservation", filterOptions.getConservScoreList());
        }

        // query for protein substitution scores (polyphen, sift)
        if (validList(filterOptions.getSubstScoreList())) {
            vd.annotationFilter("consequenceTypes.proteinVariantAnnotation.substitutionScores",
                    filterOptions.getSubstScoreList());
        }

        // query for alternate population frequency (study:population)
        if (validList(filterOptions.getPfList())) {
            vd.annotationFilter("populationFrequencies.altAlleleFreq", filterOptions.getPfList());
        }

        // query for population minor allele frequency (study:population)
        // NOTE(review): this filters on "refAlleleFreq"; confirm that is the intended field for the minor allele.
        if (validList(filterOptions.getPmafList())) {
            vd.annotationFilter("populationFrequencies.refAlleleFreq", filterOptions.getPmafList());
        }

        // Not implemented yet:
        // query for sample genotypes
        // query for number of missing alleles (study:cohort)
        // query for number of missing genotypes (study:cohort)
    }

    /**
     * Sanity check: a list is usable as a filter only when it is non-null and non-empty.
     *
     * @param list list to check
     * @return true if the list is not null and holds at least one element
     */
    private static boolean validList(List<?> list) {
        return list != null && !list.isEmpty();
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,186 @@
package org.opencb.hpg.bigdata.analysis.variant;

import org.opencb.biodata.models.core.Region;

import java.util.List;

/**
* Created by jtarraga on 13/06/17.
*/
public class VariantFilterOptions {

    /** Filter by variant ID. */
    private List<String> idList;

    /** Filter by variant type. */
    private List<String> typeList;

    /** Filter by study. */
    private List<String> studyList;

    /** Filter by biotype. */
    private List<String> biotypeList;

    /** Filter by genomic regions. */
    private List<Region> regionList;

    /**
     * Filter by Minor Allele Frequency (maf), with the format
     * study_name::cohort_name[<|>|<=|>=|==|!=]value, e.g.: 1000g::all>0.4
     */
    private List<String> mafList;

    /**
     * Filter by Minor Genotype Frequency (mgf), with the format
     * study_name::cohort_name[<|>|<=|>=|==|!=]value, e.g.: 1000g::all>0.18198
     */
    private List<String> mgfList;

    /**
     * Filter by consequence type, given as Sequence Ontology term names or accession codes,
     * e.g.: transgenic insertion,SO:32234,SO:00124
     */
    private List<String> consequenceTypeList;

    /** Filter by gene, e.g.: BIN3,ZNF517 */
    private List<String> geneList;

    /** Filter by clinvar. */
    private List<String> clinvarList;

    /** Filter by cosmic. */
    private List<String> cosmicList;

    /** Filter by conservation scores (phastCons, phylop, gerp), e.g.: phylop<0.3,phastCons<0.1 */
    private List<String> conservScoreList;

    /** Filter by protein substitution scores, e.g.: polyphen>0.3,sift>0.6 */
    private List<String> substScoreList;

    /**
     * Filter by alternate population frequency of a given study, with the format
     * study_name::population_name[<|>|<=|>=|==|!=]frequency_value, e.g.: 1000g::CEU<0.4
     */
    private List<String> pfList;

    /**
     * Filter by population minor allele frequency of a given study, with the format
     * study_name::population_name[<|>|<=|>=|==|!=]frequency_value, e.g.: 1000g::PJL<=0.25
     */
    private List<String> pmafList;

    /** @return ID filter list (null if not set) */
    public List<String> getIdList() {
        return this.idList;
    }

    /** @param idList ID filter list */
    public void setIdList(List<String> idList) {
        this.idList = idList;
    }

    /** @return Type filter list (null if not set) */
    public List<String> getTypeList() {
        return this.typeList;
    }

    /** @param typeList Type filter list */
    public void setTypeList(List<String> typeList) {
        this.typeList = typeList;
    }

    /** @return Study filter list (null if not set) */
    public List<String> getStudyList() {
        return this.studyList;
    }

    /** @param studyList Study filter list */
    public void setStudyList(List<String> studyList) {
        this.studyList = studyList;
    }

    /** @return Biotype filter list (null if not set) */
    public List<String> getBiotypeList() {
        return this.biotypeList;
    }

    /** @param biotypeList Biotype filter list */
    public void setBiotypeList(List<String> biotypeList) {
        this.biotypeList = biotypeList;
    }

    /** @return Region filter list (null if not set) */
    public List<Region> getRegionList() {
        return this.regionList;
    }

    /** @param regionList Region filter list */
    public void setRegionList(List<Region> regionList) {
        this.regionList = regionList;
    }

    /** @return Minor Allele Frequency filter list (null if not set) */
    public List<String> getMafList() {
        return this.mafList;
    }

    /** @param mafList Minor Allele Frequency filter list */
    public void setMafList(List<String> mafList) {
        this.mafList = mafList;
    }

    /** @return Minor Genotype Frequency filter list (null if not set) */
    public List<String> getMgfList() {
        return this.mgfList;
    }

    /** @param mgfList Minor Genotype Frequency filter list */
    public void setMgfList(List<String> mgfList) {
        this.mgfList = mgfList;
    }

    /** @return Consequence type filter list (null if not set) */
    public List<String> getConsequenceTypeList() {
        return this.consequenceTypeList;
    }

    /** @param consequenceTypeList Consequence type filter list */
    public void setConsequenceTypeList(List<String> consequenceTypeList) {
        this.consequenceTypeList = consequenceTypeList;
    }

    /** @return Gene filter list (null if not set) */
    public List<String> getGeneList() {
        return this.geneList;
    }

    /** @param geneList Gene filter list */
    public void setGeneList(List<String> geneList) {
        this.geneList = geneList;
    }

    /** @return Clinvar filter list (null if not set) */
    public List<String> getClinvarList() {
        return this.clinvarList;
    }

    /** @param clinvarList Clinvar filter list */
    public void setClinvarList(List<String> clinvarList) {
        this.clinvarList = clinvarList;
    }

    /** @return Cosmic filter list (null if not set) */
    public List<String> getCosmicList() {
        return this.cosmicList;
    }

    /** @param cosmicList Cosmic filter list */
    public void setCosmicList(List<String> cosmicList) {
        this.cosmicList = cosmicList;
    }

    /** @return Conservation score filter list (null if not set) */
    public List<String> getConservScoreList() {
        return this.conservScoreList;
    }

    /** @param conservScoreList Conservation score filter list */
    public void setConservScoreList(List<String> conservScoreList) {
        this.conservScoreList = conservScoreList;
    }

    /** @return Protein substitution score filter list (null if not set) */
    public List<String> getSubstScoreList() {
        return this.substScoreList;
    }

    /** @param substScoreList Protein substitution score filter list */
    public void setSubstScoreList(List<String> substScoreList) {
        this.substScoreList = substScoreList;
    }

    /** @return Alternate population frequency filter list (null if not set) */
    public List<String> getPfList() {
        return this.pfList;
    }

    /** @param pfList Alternate population frequency filter list */
    public void setPfList(List<String> pfList) {
        this.pfList = pfList;
    }

    /** @return Population minor allele frequency filter list (null if not set) */
    public List<String> getPmafList() {
        return this.pmafList;
    }

    /** @param pmafList Population minor allele frequency filter list */
    public void setPmafList(List<String> pmafList) {
        this.pmafList = pmafList;
    }
}
Loading

0 comments on commit d633e7c

Please sign in to comment.