Merge branch 'release/0.6.1'

ekwhite · Feb 11, 2016 · 3c1001f · 3c1001f
2 parents ddae2cd + bd9af86
commit 3c1001f
Show file tree

Hide file tree

Showing 77 changed files with 2,436 additions and 2,095 deletions.
diff --git a/README.md b/README.md
diff --git a/datasource-fileparsers/pom.xml b/datasource-fileparsers/pom.xml
@@ -3,7 +3,7 @@
 	<parent>
 		<artifactId>datasource</artifactId>
 		<groupId>edu.ucdenver.ccp</groupId>
-		<version>0.6</version>
+		<version>0.6.1</version>
 	</parent>
 	<artifactId>datasource-fileparsers</artifactId>
 

diff --git a/...sers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java b/...sers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java
@@ -68,6 +68,7 @@
 import edu.ucdenver.ccp.datasource.fileparsers.obo.MiOntologyIdTermPair;
 import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader;
 import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
+import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
 import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver;
 import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractionID;
 import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID;
@@ -77,8 +78,8 @@
 import edu.ucdenver.ccp.identifier.publication.PubMedID;
 
 /**
- * This class is used to parse DIPYYYMMDD files which can be downloaded from the DIP website:
- * http://dip.doe-mbi.ucla.edu/dip/Main.cgi
+ * This class is used to parse DIPYYYYMMDD files which can be downloaded from the
+ * DIP website: http://dip.doe-mbi.ucla.edu/dip/Main.cgi
  * 
  * @author Bill Baumgartner
  * 
@@ -197,7 +198,8 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
 			DipInteractionType interactionType = MiOntologyIdTermPair.parseString(DipInteractionType.class,
 					interactionTypes[i]);
 			DipProcessingStatus processingStatus = getDipProcessingStatus(processingStatuses[i], line);
-			String firstAuthorName = null; // change if the first author column ever contains names
+			String firstAuthorName = null; // change if the first author column
+											// ever contains names
 			DipPublication publication = getDipPublication(firstAuthorName, pmids[i * 2], pmids[i * 2 + 1]);
 
 			experiments.add(new DipInteractionExperiment(publication, processingStatus, detectionMethod,
@@ -212,9 +214,9 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
 	 * @param string
 	 * @param string2
 	 * @param pmids
-	 * @return {@link DipPublication} from first author name and conversions of strings like
-	 *         "pubmed:9194558" and "pubmed:DIP-209S" into a {@link PubMedID} and a
-	 *         {@link DipPublicationId}
+	 * @return {@link DipPublication} from first author name and conversions of
+	 *         strings like "pubmed:9194558" and "pubmed:DIP-209S" into a
+	 *         {@link PubMedID} and a {@link DipPublicationId}
 	 */
 	private DipPublication getDipPublication(String firstAuthorName, String pmidStr, String dipPubIdStr) {
 		PubMedID pmid;
@@ -230,7 +232,8 @@ private DipPublication getDipPublication(String firstAuthorName, String pmidStr,
 
 	/**
 	 * @param string
-	 * @return {@link DipProcessingStatus} parsed from a string such as: "dip:0002(small scale)"
+	 * @return {@link DipProcessingStatus} parsed from a string such as:
+	 *         "dip:0002(small scale)"
 	 */
 	private DipProcessingStatus getDipProcessingStatus(String statusStr, String line) {
 		Pattern p = Pattern.compile("(dip:\\d+)\\((.*?)\\)");
@@ -258,8 +261,9 @@ private DipInteractor getInteractor(String interactorStr, String alternateIdsStr
 				}
 
 				/*
-				 * The columns for alternate IDs and aliases are always set to "-". If this is no
-				 * longer the case then an exception will be thrown and code changes required.
+				 * The columns for alternate IDs and aliases are always set to
+				 * "-". If this is no longer the case then an exception will be
+				 * thrown and code changes required.
 				 */
 				Set<DipInteractorID> alternateIds = null;
 				if (!alternateIdsStr.trim().equals("-")) {
@@ -318,13 +322,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
 			return new DipInteractorID(idStr);
 		}
 		if (idStr.startsWith("refseq:")) {
-			try {
-				return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"));
-				// return new RefSeqID(StringUtil.removePrefix(idStr, "refseq:"));
-			} catch (IllegalArgumentException e) {
-				logger.warn("Invalid RefSeq identifier detected: " + idStr);
-				return null;
-			}
+			return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"), idStr);
 		}
 		if (idStr.startsWith("uniprotkb:")) {
 			if (idStr.contains(StringConstants.HYPHEN_MINUS)) {
@@ -333,8 +331,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
 			try {
 				return new UniProtID(StringUtil.removePrefix(idStr, "uniprotkb:"));
 			} catch (IllegalArgumentException e) {
-				logger.warn("Invalid UniProt identifier detected: " + idStr);
-				return null;
+				return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage());
 			}
 		}
 		throw new IllegalArgumentException("Unhandled identifier type: " + idStr);
@@ -347,16 +344,16 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
 	// * MI id
 	// * @return id if recognized; otherwise, null
 	// */
-	// private static MolecularInteractionOntologyTermID extractMiId(String inputStr) {
+	// private static MolecularInteractionOntologyTermID extractMiId(String
+	// inputStr) {
 	// Pattern methodIDPattern = Pattern.compile("(MI:\\d+),?\\(");
 	// Matcher m = methodIDPattern.matcher(inputStr);
 	// if (m.find()) {
 	// return new MolecularInteractionOntologyTermID(m.group(1));
 	// }
-	// logger.error("Unable to locate ExperimentalMethod MI ID in String: " + inputStr);
+	// logger.error("Unable to locate ExperimentalMethod MI ID in String: " +
+	// inputStr);
 	// return null;
 	// }
 
-
-
 }
diff --git a/...e-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/download/FtpHost.java b/...e-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/download/FtpHost.java
@@ -80,7 +80,5 @@ private FtpHost() {
 	public static final String KEGG_GENEMAPTAB_PATH = "pub/kegg/pathway/organisms";
 
 	public static final String MGI_REPORTS_PATH = "pub/reports";
-
-	public static final String IREFWEB_HOST = "ftp.no.embnet.org";
 
 }
diff --git a/...rs/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java b/...rs/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java
@@ -147,7 +147,9 @@
 import edu.ucdenver.ccp.datasource.identifiers.DataSource;
 import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
 import edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver;
+import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
 import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver;
+import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier;
 import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugBankID;
 import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugsProductDatabaseID;
 import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.PfamID;
@@ -158,7 +160,6 @@
 import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggCompoundID;
 import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggDrugID;
 import edu.ucdenver.ccp.datasource.identifiers.ncbi.MeshID;
-import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.GiNumberID;
 import edu.ucdenver.ccp.datasource.identifiers.ncbi.snp.SnpRsId;
 import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID;
 import edu.ucdenver.ccp.datasource.identifiers.obo.ChebiOntologyID;
@@ -931,11 +932,17 @@ private Set<Category> initCategories(CategoryListType list) {
 		}
 		Set<Category> toReturn = new HashSet<Category>();
 		for (CategoryType p : list.getCategory()) {
-			MeshID meshId = null;
+			Set<MeshID> meshIds = new HashSet<MeshID>();
 			if (!p.getMeshId().trim().isEmpty()) {
-				meshId = new MeshID(p.getMeshId().trim());
+				// drop quotes/brackets so a decorated list becomes a plain comma-separated string
+				String meshStr = p.getMeshId().trim().replace("\"", "").replace("[", "").replace("]", "");
+				for (String tok : meshStr.split(",")) {
+					String meshIdStr = tok.trim(); // entries may be separated by ", "; drop the padding
+					if (!meshIdStr.isEmpty()) { // an emptied list leaves one blank token; skip it
+						meshIds.add(new MeshID(meshIdStr));
+					}
+				}
 			}
-			Category c = new Category(meshId, p.getCategory());
+			Category c = new Category(meshIds, p.getCategory());
 			toReturn.add(c);
 		}
 		return toReturn;
@@ -945,7 +952,7 @@ private Set<Category> initCategories(CategoryListType list) {
 	@Record(dataSource = DataSource.DRUGBANK)
 	private static class Category {
 		@RecordField
-		private final MeshID meshId;
+		private final Set<MeshID> meshIds;
 		@RecordField
 		private final String category;
 	}
@@ -1297,22 +1304,18 @@ private static DataSourceIdentifier<?> resolveIdentifier(String resource, String
 		} else if (resource.equals("GeneCards")) {
 			return new GeneCardId(identifier);
 		} else if (resource.equals("GenBank Gene Database")) {
-			return NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
+			return NucleotideAccessionResolver.resolveNucleotideAccession(identifier, "GenBank Gene Database:"
+					+ identifier);
 		} else if (resource.equals("GenBank Protein Database")) {
-			try {
-				return ProteinAccessionResolver.resolveProteinAccession(identifier);
-			} catch (IllegalArgumentException e) {
-				if (identifier.matches("\\d+")) {
-					return new GiNumberID(identifier);
-				}
-				logger.warn("Observed invalid GenBank protein identifier: " + identifier);
-				return null;
-			}
+			return ProteinAccessionResolver // label uses the same "<resource>:<identifier>" form as the sibling branches
+					.resolveProteinAccession(identifier, "GenBank Protein Database:" + identifier);
 		} else if (resource.equals("GenBank")) {
-			try {
-				return NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
-			} catch (IllegalArgumentException e) {
-				return ProteinAccessionResolver.resolveProteinAccession(identifier);
+			DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier,
+					"GenBank:" + identifier); // nucleotide resolution is attempted first
+			if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { // test the identifier, not its Class object
+				return ProteinAccessionResolver.resolveProteinAccession(identifier, "GenBank:" + identifier);
+			} else {
+				return nucAccId;
 			}
 		} else if (resource.equals("UniProtKB")) {
 			return new UniProtID(identifier);
@@ -1354,15 +1357,15 @@ private static DataSourceIdentifier<?> resolveIdentifier(String resource, String
 				id = new UniProtID(identifier);
 			} catch (IllegalArgumentException e) {
 				logger.warn("Unhandled identifier type: " + resource + " (identifier=" + identifier + ")");
-				return null;
+				return new UnknownDataSourceIdentifier(identifier, resource);
 			}
 			if (id != null) {
 				return id;
 			}
 		}
 
 		System.out.println("Unhandled identifier type: " + resource + " (identifier=" + identifier + ")");
-		return null;
+		return new UnknownDataSourceIdentifier(identifier, resource);
 		// throw new IllegalArgumentException("Unhandled identifier type: " +
 		// resource +
 		// " (identifier=" + identifier

diff --git a/.../edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java b/.../edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java
@@ -500,7 +500,7 @@ protected abstract T invokeConstructor(E idLineContents, List<D> accessionNumber
 	private EmblAssemblyInformation parseASLine(String line) {
 		String[] toks = line.split("\\s+");
 		String localSpan = toks[1];
-		DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2]);
+		DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2], toks[2]);
 		String primarySpan = toks[3];
 		boolean originatesFromComplementary = (toks.length == 5 && toks[4].trim().equalsIgnoreCase("c")) ? true : false;
 		return new EmblAssemblyInformation(localSpan, primaryIdentifier, primarySpan, originatesFromComplementary);

diff --git a/...va/edu/ucdenver/ccp/datasource/fileparsers/ebi/goa/GpAssociationGoaUniprotFileParser.java b/...va/edu/ucdenver/ccp/datasource/fileparsers/ebi/goa/GpAssociationGoaUniprotFileParser.java
@@ -63,6 +63,7 @@
 import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtIsoformID;
 import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID;
 import edu.ucdenver.ccp.datasource.identifiers.obo.GeneOntologyID;
+import edu.ucdenver.ccp.datasource.identifiers.other.RnaCentralId;
 import edu.ucdenver.ccp.datasource.identifiers.reactome.ReactomeReactionID;
 import edu.ucdenver.ccp.identifier.publication.DOI;
 import edu.ucdenver.ccp.identifier.publication.PubMedID;
@@ -317,6 +318,9 @@ private static DataSourceIdentifier<?> createDatabaseObjectID(String database, S
 			if (database.equals("IntAct")) {
 				return new IntActID(databaseObjectIDStr);
 			}
+			if (database.equals("RNAcentral")) {
+				return new RnaCentralId(databaseObjectIDStr);
+			}
 		} catch (IllegalArgumentException e) {
 			logger.warn(e.getMessage());
 		}

diff --git a/...edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/SparseUniProtXmlFileRecordReader.java b/...edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/SparseUniProtXmlFileRecordReader.java
@@ -97,7 +97,6 @@ protected boolean hasTaxonOfInterest(SparseUniProtFileRecord record) {
 		}
 		for (DbReference dbRef : record.getOrganism().getDbReference()) {
 			if (getTaxonsOfInterest().contains(dbRef.getId())) {
-				System.out.println("has taxon of interest: " + dbRef.getId());
 				return true;
 			}
 		}

diff --git a/.../src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/UniProtFileRecord.java b/.../src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/UniProtFileRecord.java
@@ -80,6 +80,8 @@
 import edu.ucdenver.ccp.datasource.fileparsers.RecordField;
 import edu.ucdenver.ccp.datasource.identifiers.DataSource;
 import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
+import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
+import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier;
 import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID;
 import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugBankID;
 import edu.ucdenver.ccp.datasource.identifiers.ebi.embl.EmblID;
@@ -899,14 +901,10 @@ private DataSourceIdentifier<?> resolveDatabaseIdentifer(String type, String idS
 					return new PirnrId(idStr);
 				}
 			} catch (IllegalArgumentException e) {
-				logger.warn("Invalid identifier detected: " + e.getMessage());
-				return null;
+				return new ProbableErrorDataSourceIdentifier(idStr, type, e.getMessage());
 			}
 
-			// throw new IllegalArgumentException("Unhandled identifier type: "
-			// + type + " :: " + idStr);
-			logger.warn("Unhandled identifier type: " + type + " :: " + idStr);
-			return null;
+			return new UnknownDataSourceIdentifier(idStr, type);
 		}
 	}
 

diff --git a/.../java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java b/.../java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java
@@ -73,22 +73,24 @@
  * 
  * @author Bill Baumgartner
  * 
- *         ID _________ Association(Y/N) _________ Broad Phenotype Disease Class _________ Disease
- *         Class Code _________ MeSH Disease Terms _________ Chromosom _________ Chr-Band _________
- *         _________ Gene _________ DNA Start _________ DNA End P Value Reference _________ Pubmed
- *         ID _________ Allele Author Description _________ Allele Functional Effects _________
- *         Polymophism Class _________ Gene Name _________ RefSeq _________ Population _________
- *         MeSH Geolocation _________ Submitter _________ Locus Number _________ Unigene _________
- *         Narrow Phenotype _________ Mole. Phenotype Journal Title _________ rs Number _________
- *         OMIM ID Year _________ Conclusion _________ Study Info _________ Env. Factor _________ GI
- *         Gene A _________ GI Allele of Gene A _________ GI Gene B _________ GI Allele of Gene B
- *         _________ GI Gene C _________ GI Allele of Gene C _________ GI Association? GI combine
+ *         ID _________ Association(Y/N) _________ Broad Phenotype Disease Class
+ *         _________ Disease Class Code _________ MeSH Disease Terms _________
+ *         Chromosom _________ Chr-Band _________ _________ Gene _________ DNA
+ *         Start _________ DNA End P Value Reference _________ Pubmed ID
+ *         _________ Allele Author Description _________ Allele Functional
+ *         Effects _________ Polymophism Class _________ Gene Name _________
+ *         RefSeq _________ Population _________ MeSH Geolocation _________
+ *         Submitter _________ Locus Number _________ Unigene _________ Narrow
+ *         Phenotype _________ Mole. Phenotype Journal Title _________ rs Number
+ *         _________ OMIM ID Year _________ Conclusion _________ Study Info
+ *         _________ Env. Factor _________ GI Gene A _________ GI Allele of Gene
+ *         A _________ GI Gene B _________ GI Allele of Gene B _________ GI Gene
+ *         C _________ GI Allele of Gene C _________ GI Association? GI combine
  *         Env. Factor _________ GI relevant to Disease
  */
 
-@Record(dataSource = DataSource.GAD, schemaVersion="2", comment="Schema version is 2 b/c one field was dropped: GAD/CDC", label="GAD record")
+@Record(dataSource = DataSource.GAD, schemaVersion = "2", comment = "Schema version is 2 b/c one field was dropped: GAD/CDC", label = "GAD record")
 public class GeneticAssociationDbAllTxtFileData extends SingleLineFileRecord {
-
 
 	private static final Logger logger = Logger.getLogger(GeneticAssociationDbAllTxtFileData.class);
 
@@ -422,7 +424,7 @@ public boolean hasAssociation() {
 	}
 
 	public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxtLine(Line line) {
-		String[] toks = line.getText().split("\\t",-1);
+		String[] toks = line.getText().split("\\t", -1);
 		if (toks.length < 23) {
 			logger.warn("Invalid line detected (" + line.getLineNumber() + "): " + line.getText());
 		}
@@ -454,9 +456,10 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt
 		String geneName = toks[17];
 		String refseqURL = null;
 		try {
-		 refseqURL = toks[18];
+			refseqURL = toks[18];
 		} catch (ArrayIndexOutOfBoundsException e) {
-			logger.error("Caught exception. Line: (" + line.getLineNumber() + ") #toks: " + toks.length+" Message: " + e.getMessage() + " LINE: " + line.getText());
+			logger.error("Caught exception. Line: (" + line.getLineNumber() + ") #toks: " + toks.length + " Message: "
+					+ e.getMessage() + " LINE: " + line.getText());
 		}
 
 		DataSourceIdentifier<?> nucleotideId = null;
@@ -470,13 +473,7 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt
 			if (acc.matches("\\d+")) {
 				nucleotideId = new GiNumberID(acc);
 			} else {
-				try {
-					nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
-				} catch (IllegalArgumentException e) {
-					logger.info("tok: " + refseqURL + ";");
-					logger.warn(e.getMessage());
-					nucleotideId = null;
-				}
+				nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, refseqURL);
 			}
 		}