Skip to content

Commit

Permalink
Merge branch 'release/0.6.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
bill-baumgartner committed Feb 11, 2016
2 parents ddae2cd + bd9af86 commit 3c1001f
Show file tree
Hide file tree
Showing 77 changed files with 2,436 additions and 2,095 deletions.
226 changes: 136 additions & 90 deletions README.md

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion datasource-fileparsers/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<parent>
<artifactId>datasource</artifactId>
<groupId>edu.ucdenver.ccp</groupId>
<version>0.6</version>
<version>0.6.1</version>
</parent>
<artifactId>datasource-fileparsers</artifactId>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
import edu.ucdenver.ccp.datasource.fileparsers.obo.MiOntologyIdTermPair;
import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader;
import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver;
import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractionID;
import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID;
Expand All @@ -77,8 +78,8 @@
import edu.ucdenver.ccp.identifier.publication.PubMedID;

/**
* This class is used to parse DIPYYYMMDD files which can be downloaded from the DIP website:
* http://dip.doe-mbi.ucla.edu/dip/Main.cgi
* This class is used to parse DIPYYYYMMDD files which can be downloaded from the
* DIP website: http://dip.doe-mbi.ucla.edu/dip/Main.cgi
*
* @author Bill Baumgartner
*
Expand Down Expand Up @@ -197,7 +198,8 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
DipInteractionType interactionType = MiOntologyIdTermPair.parseString(DipInteractionType.class,
interactionTypes[i]);
DipProcessingStatus processingStatus = getDipProcessingStatus(processingStatuses[i], line);
String firstAuthorName = null; // change if the first author column ever contains names
String firstAuthorName = null; // change if the first author column
// ever contains names
DipPublication publication = getDipPublication(firstAuthorName, pmids[i * 2], pmids[i * 2 + 1]);

experiments.add(new DipInteractionExperiment(publication, processingStatus, detectionMethod,
Expand All @@ -212,9 +214,9 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
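* @param firstAuthorName
*            name of the first author; callers currently pass null
* @param pmidStr
*            PubMed reference string, e.g. "pubmed:9194558"
* @param dipPubIdStr
*            DIP publication reference string, e.g. "pubmed:DIP-209S"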
* @return {@link DipPublication} from first author name and conversions of strings like
* "pubmed:9194558" and "pubmed:DIP-209S" into a {@link PubMedID} and a
* {@link DipPublicationId}
* @return {@link DipPublication} from first author name and conversions of
* strings like "pubmed:9194558" and "pubmed:DIP-209S" into a
* {@link PubMedID} and a {@link DipPublicationId}
*/
private DipPublication getDipPublication(String firstAuthorName, String pmidStr, String dipPubIdStr) {
PubMedID pmid;
Expand All @@ -230,7 +232,8 @@ private DipPublication getDipPublication(String firstAuthorName, String pmidStr,

/**
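* @param statusStr
*            the processing status string to parse
* @param line
*            the file line that the status string was taken from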
* @return {@link DipProcessingStatus} parsed from a string such as: "dip:0002(small scale)"
* @return {@link DipProcessingStatus} parsed from a string such as:
* "dip:0002(small scale)"
*/
private DipProcessingStatus getDipProcessingStatus(String statusStr, String line) {
Pattern p = Pattern.compile("(dip:\\d+)\\((.*?)\\)");
Expand Down Expand Up @@ -258,8 +261,9 @@ private DipInteractor getInteractor(String interactorStr, String alternateIdsStr
}

/*
* The columns for alternate IDs and aliases are always set to "-". If this is no
* longer the case then an exception will be thrown and code changes required.
* The columns for alternate IDs and aliases are always set to
* "-". If this is no longer the case, an exception will be
* thrown and code changes will be required.
*/
Set<DipInteractorID> alternateIds = null;
if (!alternateIdsStr.trim().equals("-")) {
Expand Down Expand Up @@ -318,13 +322,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
return new DipInteractorID(idStr);
}
if (idStr.startsWith("refseq:")) {
try {
return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"));
// return new RefSeqID(StringUtil.removePrefix(idStr, "refseq:"));
} catch (IllegalArgumentException e) {
logger.warn("Invalid RefSeq identifier detected: " + idStr);
return null;
}
return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"), idStr);
}
if (idStr.startsWith("uniprotkb:")) {
if (idStr.contains(StringConstants.HYPHEN_MINUS)) {
Expand All @@ -333,8 +331,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
try {
return new UniProtID(StringUtil.removePrefix(idStr, "uniprotkb:"));
} catch (IllegalArgumentException e) {
logger.warn("Invalid UniProt identifier detected: " + idStr);
return null;
return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage());
}
}
throw new IllegalArgumentException("Unhandled identifier type: " + idStr);
Expand All @@ -347,16 +344,16 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
// * MI id
// * @return id if recognized; otherwise, null
// */
// private static MolecularInteractionOntologyTermID extractMiId(String inputStr) {
// private static MolecularInteractionOntologyTermID extractMiId(String
// inputStr) {
// Pattern methodIDPattern = Pattern.compile("(MI:\\d+),?\\(");
// Matcher m = methodIDPattern.matcher(inputStr);
// if (m.find()) {
// return new MolecularInteractionOntologyTermID(m.group(1));
// }
// logger.error("Unable to locate ExperimentalMethod MI ID in String: " + inputStr);
// logger.error("Unable to locate ExperimentalMethod MI ID in String: " +
// inputStr);
// return null;
// }



}
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,5 @@ private FtpHost() {
public static final String KEGG_GENEMAPTAB_PATH = "pub/kegg/pathway/organisms";

public static final String MGI_REPORTS_PATH = "pub/reports";

public static final String IREFWEB_HOST = "ftp.no.embnet.org";

}
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,9 @@
import edu.ucdenver.ccp.datasource.identifiers.DataSource;
import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver;
import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver;
import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugBankID;
import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugsProductDatabaseID;
import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.PfamID;
Expand All @@ -158,7 +160,6 @@
import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggCompoundID;
import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggDrugID;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.MeshID;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.GiNumberID;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.snp.SnpRsId;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID;
import edu.ucdenver.ccp.datasource.identifiers.obo.ChebiOntologyID;
Expand Down Expand Up @@ -931,11 +932,17 @@ private Set<Category> initCategories(CategoryListType list) {
}
Set<Category> toReturn = new HashSet<Category>();
for (CategoryType p : list.getCategory()) {
MeshID meshId = null;
Set<MeshID> meshIds = new HashSet<MeshID>();
if (!p.getMeshId().trim().isEmpty()) {
meshId = new MeshID(p.getMeshId().trim());
String meshStr = p.getMeshId().trim();
// remove quote and bracket characters so only comma-separated ids remain
meshStr = meshStr.replace("\"", "").replace("[", "").replace("]", "");
for (String tok : meshStr.split(",")) {
// split() keeps whitespace around the separators and can yield empty
// tokens; neither should reach the MeshID constructor
String meshIdStr = tok.trim();
if (!meshIdStr.isEmpty()) {
meshIds.add(new MeshID(meshIdStr));
}
}
}
Category c = new Category(meshId, p.getCategory());
Category c = new Category(meshIds, p.getCategory());
toReturn.add(c);
}
return toReturn;
Expand All @@ -945,7 +952,7 @@ private Set<Category> initCategories(CategoryListType list) {
@Record(dataSource = DataSource.DRUGBANK)
private static class Category {
@RecordField
private final MeshID meshId;
private final Set<MeshID> meshIds;
@RecordField
private final String category;
}
Expand Down Expand Up @@ -1297,22 +1304,18 @@ private static DataSourceIdentifier<?> resolveIdentifier(String resource, String
} else if (resource.equals("GeneCards")) {
return new GeneCardId(identifier);
} else if (resource.equals("GenBank Gene Database")) {
return NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
return NucleotideAccessionResolver.resolveNucleotideAccession(identifier, "GenBank Gene Database:"
+ identifier);
} else if (resource.equals("GenBank Protein Database")) {
try {
return ProteinAccessionResolver.resolveProteinAccession(identifier);
} catch (IllegalArgumentException e) {
if (identifier.matches("\\d+")) {
return new GiNumberID(identifier);
}
logger.warn("Observed invalid GenBank protein identifier: " + identifier);
return null;
}
// keep the "<resource>:<identifier>" form used by the other GenBank branches
return ProteinAccessionResolver
.resolveProteinAccession(identifier, "GenBank Protein Database:" + identifier);
} else if (resource.equals("GenBank")) {
try {
return NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
} catch (IllegalArgumentException e) {
return ProteinAccessionResolver.resolveProteinAccession(identifier);
DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier,
"GenBank:" + identifier);
// Class.isInstance must be given the object itself. Passing
// nucAccId.getClass() tests a Class object, which is never a
// ProbableErrorDataSourceIdentifier, so the protein-accession
// fallback below could never be reached.
if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) {
return ProteinAccessionResolver.resolveProteinAccession(identifier, "GenBank:" + identifier);
} else {
return nucAccId;
}
} else if (resource.equals("UniProtKB")) {
return new UniProtID(identifier);
Expand Down Expand Up @@ -1354,15 +1357,15 @@ private static DataSourceIdentifier<?> resolveIdentifier(String resource, String
id = new UniProtID(identifier);
} catch (IllegalArgumentException e) {
logger.warn("Unhandled identifier type: " + resource + " (identifier=" + identifier + ")");
return null;
return new UnknownDataSourceIdentifier(identifier, resource);
}
if (id != null) {
return id;
}
}

System.out.println("Unhandled identifier type: " + resource + " (identifier=" + identifier + ")");
return null;
return new UnknownDataSourceIdentifier(identifier, resource);
// throw new IllegalArgumentException("Unhandled identifier type: " +
// resource +
// " (identifier=" + identifier
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ protected abstract T invokeConstructor(E idLineContents, List<D> accessionNumber
private EmblAssemblyInformation parseASLine(String line) {
String[] toks = line.split("\\s+");
String localSpan = toks[1];
DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2]);
DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2], toks[2]);
String primarySpan = toks[3];
boolean originatesFromComplementary = (toks.length == 5 && toks[4].trim().equalsIgnoreCase("c")) ? true : false;
return new EmblAssemblyInformation(localSpan, primaryIdentifier, primarySpan, originatesFromComplementary);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@
import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtIsoformID;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID;
import edu.ucdenver.ccp.datasource.identifiers.obo.GeneOntologyID;
import edu.ucdenver.ccp.datasource.identifiers.other.RnaCentralId;
import edu.ucdenver.ccp.datasource.identifiers.reactome.ReactomeReactionID;
import edu.ucdenver.ccp.identifier.publication.DOI;
import edu.ucdenver.ccp.identifier.publication.PubMedID;
Expand Down Expand Up @@ -317,6 +318,9 @@ private static DataSourceIdentifier<?> createDatabaseObjectID(String database, S
if (database.equals("IntAct")) {
return new IntActID(databaseObjectIDStr);
}
if (database.equals("RNAcentral")) {
return new RnaCentralId(databaseObjectIDStr);
}
} catch (IllegalArgumentException e) {
logger.warn(e.getMessage());
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,6 @@ protected boolean hasTaxonOfInterest(SparseUniProtFileRecord record) {
}
for (DbReference dbRef : record.getOrganism().getDbReference()) {
if (getTaxonsOfInterest().contains(dbRef.getId())) {
System.out.println("has taxon of interest: " + dbRef.getId());
return true;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@
import edu.ucdenver.ccp.datasource.fileparsers.RecordField;
import edu.ucdenver.ccp.datasource.identifiers.DataSource;
import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID;
import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugBankID;
import edu.ucdenver.ccp.datasource.identifiers.ebi.embl.EmblID;
Expand Down Expand Up @@ -899,14 +901,10 @@ private DataSourceIdentifier<?> resolveDatabaseIdentifer(String type, String idS
return new PirnrId(idStr);
}
} catch (IllegalArgumentException e) {
logger.warn("Invalid identifier detected: " + e.getMessage());
return null;
return new ProbableErrorDataSourceIdentifier(idStr, type, e.getMessage());
}

// throw new IllegalArgumentException("Unhandled identifier type: "
// + type + " :: " + idStr);
logger.warn("Unhandled identifier type: " + type + " :: " + idStr);
return null;
return new UnknownDataSourceIdentifier(idStr, type);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,22 +73,24 @@
*
* @author Bill Baumgartner
*
* ID _________ Association(Y/N) _________ Broad Phenotype Disease Class _________ Disease
* Class Code _________ MeSH Disease Terms _________ Chromosom _________ Chr-Band _________
* _________ Gene _________ DNA Start _________ DNA End P Value Reference _________ Pubmed
* ID _________ Allele Author Description _________ Allele Functional Effects _________
* Polymophism Class _________ Gene Name _________ RefSeq _________ Population _________
* MeSH Geolocation _________ Submitter _________ Locus Number _________ Unigene _________
* Narrow Phenotype _________ Mole. Phenotype Journal Title _________ rs Number _________
* OMIM ID Year _________ Conclusion _________ Study Info _________ Env. Factor _________ GI
* Gene A _________ GI Allele of Gene A _________ GI Gene B _________ GI Allele of Gene B
* _________ GI Gene C _________ GI Allele of Gene C _________ GI Association? GI combine
* ID _________ Association(Y/N) _________ Broad Phenotype Disease Class
* _________ Disease Class Code _________ MeSH Disease Terms _________
* Chromosom _________ Chr-Band _________ _________ Gene _________ DNA
* Start _________ DNA End P Value Reference _________ Pubmed ID
* _________ Allele Author Description _________ Allele Functional
* Effects _________ Polymophism Class _________ Gene Name _________
* RefSeq _________ Population _________ MeSH Geolocation _________
* Submitter _________ Locus Number _________ Unigene _________ Narrow
* Phenotype _________ Mole. Phenotype Journal Title _________ rs Number
* _________ OMIM ID Year _________ Conclusion _________ Study Info
* _________ Env. Factor _________ GI Gene A _________ GI Allele of Gene
* A _________ GI Gene B _________ GI Allele of Gene B _________ GI Gene
* C _________ GI Allele of Gene C _________ GI Association? GI combine
* Env. Factor _________ GI relevant to Disease
*/

@Record(dataSource = DataSource.GAD, schemaVersion="2", comment="Schema version is 2 b/c one field was dropped: GAD/CDC", label="GAD record")
@Record(dataSource = DataSource.GAD, schemaVersion = "2", comment = "Schema version is 2 b/c one field was dropped: GAD/CDC", label = "GAD record")
public class GeneticAssociationDbAllTxtFileData extends SingleLineFileRecord {


private static final Logger logger = Logger.getLogger(GeneticAssociationDbAllTxtFileData.class);

Expand Down Expand Up @@ -422,7 +424,7 @@ public boolean hasAssociation() {
}

public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxtLine(Line line) {
String[] toks = line.getText().split("\\t",-1);
String[] toks = line.getText().split("\\t", -1);
if (toks.length < 23) {
logger.warn("Invalid line detected (" + line.getLineNumber() + "): " + line.getText());
}
Expand Down Expand Up @@ -454,9 +456,10 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt
String geneName = toks[17];
String refseqURL = null;
try {
refseqURL = toks[18];
refseqURL = toks[18];
} catch (ArrayIndexOutOfBoundsException e) {
logger.error("Caught exception. Line: (" + line.getLineNumber() + ") #toks: " + toks.length+" Message: " + e.getMessage() + " LINE: " + line.getText());
logger.error("Caught exception. Line: (" + line.getLineNumber() + ") #toks: " + toks.length + " Message: "
+ e.getMessage() + " LINE: " + line.getText());
}

DataSourceIdentifier<?> nucleotideId = null;
Expand All @@ -470,13 +473,7 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt
if (acc.matches("\\d+")) {
nucleotideId = new GiNumberID(acc);
} else {
try {
nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
} catch (IllegalArgumentException e) {
logger.info("tok: " + refseqURL + ";");
logger.warn(e.getMessage());
nucleotideId = null;
}
nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, refseqURL);
}
}

Expand Down
Loading

0 comments on commit 3c1001f

Please sign in to comment.