Added argument for identifier resolution to improve error messages

ekwhite · Feb 9, 2016 · 7bf4214 · 7bf4214
1 parent 27e6660
commit 7bf4214
Show file tree

Hide file tree

Showing 19 changed files with 153 additions and 134 deletions.
diff --git a/...sers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java b/...sers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java
@@ -68,6 +68,7 @@
 import edu.ucdenver.ccp.datasource.fileparsers.obo.MiOntologyIdTermPair;
 import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader;
 import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
+import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
 import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver;
 import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractionID;
 import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID;
@@ -77,8 +78,8 @@
 import edu.ucdenver.ccp.identifier.publication.PubMedID;
 
 /**
- * This class is used to parse DIPYYYMMDD files which can be downloaded from the DIP website:
- * http://dip.doe-mbi.ucla.edu/dip/Main.cgi
+ * This class is used to parse DIPYYYYMMDD files which can be downloaded from the
+ * DIP website: http://dip.doe-mbi.ucla.edu/dip/Main.cgi
  * 
  * @author Bill Baumgartner
  * 
@@ -197,7 +198,8 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
 			DipInteractionType interactionType = MiOntologyIdTermPair.parseString(DipInteractionType.class,
 					interactionTypes[i]);
 			DipProcessingStatus processingStatus = getDipProcessingStatus(processingStatuses[i], line);
-			String firstAuthorName = null; // change if the first author column ever contains names
+			String firstAuthorName = null; // change if the first author column
+											// ever contains names
 			DipPublication publication = getDipPublication(firstAuthorName, pmids[i * 2], pmids[i * 2 + 1]);
 
 			experiments.add(new DipInteractionExperiment(publication, processingStatus, detectionMethod,
@@ -212,9 +214,9 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
 	 * @param string
 	 * @param string2
 	 * @param pmids
-	 * @return {@link DipPublication} from first author name and conversions of strings like
-	 *         "pubmed:9194558" and "pubmed:DIP-209S" into a {@link PubMedID} and a
-	 *         {@link DipPublicationId}
+	 * @return {@link DipPublication} from first author name and conversions of
+	 *         strings like "pubmed:9194558" and "pubmed:DIP-209S" into a
+	 *         {@link PubMedID} and a {@link DipPublicationId}
 	 */
 	private DipPublication getDipPublication(String firstAuthorName, String pmidStr, String dipPubIdStr) {
 		PubMedID pmid;
@@ -230,7 +232,8 @@ private DipPublication getDipPublication(String firstAuthorName, String pmidStr,
 
 	/**
 	 * @param string
-	 * @return {@link DipProcessingStatus} parsed from a string such as: "dip:0002(small scale)"
+	 * @return {@link DipProcessingStatus} parsed from a string such as:
+	 *         "dip:0002(small scale)"
 	 */
 	private DipProcessingStatus getDipProcessingStatus(String statusStr, String line) {
 		Pattern p = Pattern.compile("(dip:\\d+)\\((.*?)\\)");
@@ -258,8 +261,9 @@ private DipInteractor getInteractor(String interactorStr, String alternateIdsStr
 				}
 
 				/*
-				 * The columns for alternate IDs and aliases are always set to "-". If this is no
-				 * longer the case then an exception will be thrown and code changes required.
+				 * The columns for alternate IDs and aliases are always set to
+				 * "-". If this is no longer the case then an exception will be
+				 * thrown and code changes required.
 				 */
 				Set<DipInteractorID> alternateIds = null;
 				if (!alternateIdsStr.trim().equals("-")) {
@@ -318,13 +322,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
 			return new DipInteractorID(idStr);
 		}
 		if (idStr.startsWith("refseq:")) {
-			try {
-				return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"));
-				// return new RefSeqID(StringUtil.removePrefix(idStr, "refseq:"));
-			} catch (IllegalArgumentException e) {
-				logger.warn("Invalid RefSeq identifier detected: " + idStr);
-				return null;
-			}
+			return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"), idStr);
 		}
 		if (idStr.startsWith("uniprotkb:")) {
 			if (idStr.contains(StringConstants.HYPHEN_MINUS)) {
@@ -333,8 +331,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
 			try {
 				return new UniProtID(StringUtil.removePrefix(idStr, "uniprotkb:"));
 			} catch (IllegalArgumentException e) {
-				logger.warn("Invalid UniProt identifier detected: " + idStr);
-				return null;
+				return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage());
 			}
 		}
 		throw new IllegalArgumentException("Unhandled identifier type: " + idStr);
@@ -347,16 +344,16 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
 	// * MI id
 	// * @return id if recognized; otherwise, null
 	// */
-	// private static MolecularInteractionOntologyTermID extractMiId(String inputStr) {
+	// private static MolecularInteractionOntologyTermID extractMiId(String
+	// inputStr) {
 	// Pattern methodIDPattern = Pattern.compile("(MI:\\d+),?\\(");
 	// Matcher m = methodIDPattern.matcher(inputStr);
 	// if (m.find()) {
 	// return new MolecularInteractionOntologyTermID(m.group(1));
 	// }
-	// logger.error("Unable to locate ExperimentalMethod MI ID in String: " + inputStr);
+	// logger.error("Unable to locate ExperimentalMethod MI ID in String: " +
+	// inputStr);
 	// return null;
 	// }
 
-
-
 }
diff --git a/...rs/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java b/...rs/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java
@@ -160,7 +160,6 @@
 import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggCompoundID;
 import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggDrugID;
 import edu.ucdenver.ccp.datasource.identifiers.ncbi.MeshID;
-import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.GiNumberID;
 import edu.ucdenver.ccp.datasource.identifiers.ncbi.snp.SnpRsId;
 import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID;
 import edu.ucdenver.ccp.datasource.identifiers.obo.ChebiOntologyID;
@@ -1299,22 +1298,16 @@ private static DataSourceIdentifier<?> resolveIdentifier(String resource, String
 		} else if (resource.equals("GeneCards")) {
 			return new GeneCardId(identifier);
 		} else if (resource.equals("GenBank Gene Database")) {
-			return NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
+			return NucleotideAccessionResolver.resolveNucleotideAccession(identifier, "GenBank Gene Database:"
+					+ identifier);
 		} else if (resource.equals("GenBank Protein Database")) {
-			try {
-				return ProteinAccessionResolver.resolveProteinAccession(identifier);
-			} catch (IllegalArgumentException e) {
-				if (identifier.matches("\\d+")) {
-					return new GiNumberID(identifier);
-				} else {
-					return new ProbableErrorDataSourceIdentifier("identifier", "GenBank",
-							"Observed invalid GenBank protein identifier: " + identifier);
-				}
-			}
+			return ProteinAccessionResolver
+					.resolveProteinAccession(identifier, "GenBank Protein Database" + identifier);
 		} else if (resource.equals("GenBank")) {
-			DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
+			DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier,
+					"GenBank:" + identifier);
 			if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId.getClass())) {
-				return ProteinAccessionResolver.resolveProteinAccession(identifier);
+				return ProteinAccessionResolver.resolveProteinAccession(identifier, "GenBank:" + identifier);
 			}
 		} else if (resource.equals("UniProtKB")) {
 			return new UniProtID(identifier);

diff --git a/.../edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java b/.../edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java
@@ -500,7 +500,7 @@ protected abstract T invokeConstructor(E idLineContents, List<D> accessionNumber
 	private EmblAssemblyInformation parseASLine(String line) {
 		String[] toks = line.split("\\s+");
 		String localSpan = toks[1];
-		DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2]);
+		DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2], toks[2]);
 		String primarySpan = toks[3];
 		boolean originatesFromComplementary = (toks.length == 5 && toks[4].trim().equalsIgnoreCase("c")) ? true : false;
 		return new EmblAssemblyInformation(localSpan, primaryIdentifier, primarySpan, originatesFromComplementary);

diff --git a/.../java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java b/.../java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java
@@ -473,7 +473,7 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt
 			if (acc.matches("\\d+")) {
 				nucleotideId = new GiNumberID(acc);
 			} else {
-				nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
+				nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, refseqURL);
 			}
 		}
 

diff --git a/...rs/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java b/...rs/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java
@@ -564,9 +564,9 @@ private Set<DataSourceIdentifier<?>> resolveAccessionNumbers(String accListStr)
 		Set<DataSourceIdentifier<?>> accNumbers = new HashSet<DataSourceIdentifier<?>>();
 		if (!accListStr.isEmpty()) {
 			for (String acc : accListStr.split(",")) {
-				DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
+				DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, acc);
 				if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) {
-					DataSourceIdentifier<String> proAccId = ProteinAccessionResolver.resolveProteinAccession(acc);
+					DataSourceIdentifier<String> proAccId = ProteinAccessionResolver.resolveProteinAccession(acc, acc);
 					accNumbers.add(proAccId);
 				} else {
 					accNumbers.add(nucAccId);

diff --git a/...c/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java b/...c/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java
@@ -115,9 +115,9 @@ protected HprdIdMappingsTxtFileData parseRecordFromLine(Line line) {
 	}
 
 	private DataSourceIdentifier<?> resolveAccession(String acc) {
-		DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
+		DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, acc);
 		if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) {
-			return ProteinAccessionResolver.resolveProteinAccession(acc);
+			return ProteinAccessionResolver.resolveProteinAccession(acc, acc);
 		}
 		return nucAccId;
 	}

diff --git a/...ers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebInteraction.java b/...ers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebInteraction.java
@@ -46,7 +46,6 @@
 import edu.ucdenver.ccp.datasource.identifiers.irefweb.IrigId;
 import edu.ucdenver.ccp.datasource.identifiers.irefweb.RigId;
 import edu.ucdenver.ccp.datasource.identifiers.other.ImexId;
-import edu.ucdenver.ccp.identifier.publication.PubMedID;
 
 @Data
 @Record(dataSource = DataSource.IREFWEB, label="interaction")
@@ -57,7 +56,7 @@ public class IRefWebInteraction implements DataRecord {
 	@RecordField(comment = "Notes: According to MITAB2.6 format this column should contain a pipe-delimited list of author surnames in which the interaction has been shown.\nThis column will usually include only one author name reference. However, some experimental evidences have secondary references which could be included here. This filed also includes references which are not author names as in the following examples:\nOPHID Predicted Protein Interaction\nHPRD Text Mining Confirmation\nMINT Text Mining Confirmation")
 	private final String author;
 	@RecordField(comment = "Notes: This is a non-redundant list of PubMed identifiers pointing to literature that supports the interaction. According to MITAB2.6 format, this column should contain a pipe-delimited set of databaseName:identifier pairs such as pubmed:12345. The source database name is always pubmed.")
-	private final Set<PubMedID> pmids;
+	private final Set<DataSourceIdentifier<?>> pmids;
 	@RecordField
 	private final IRefWebInteractionType interactionType;
 	@RecordField(comment = "source interaction-database and accessions.\nExample: intact:EBI-761694|rigid:3ERiFkUFsm7ZUHIRJTx8ZlHILRA|irigid:1234|edgetype:X\nNotes: Each reference is presented as a database name:identifier pair.\nChange: The source database is listed first. Additional information is pipe-delimited and presented here for the convenience of PSICQUIC web-service users (these services presently truncate this file at column 15 as they only support MITAB2.5). See columns 35,45,53.\nThe source database names that appear in this column are taken from the PSI-MI controlled vocabulary at the following location (where possible): http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI\nIf an interaction record identifier is not provided by the source database, this entry will appear as database-name:- with the identifier region replaced with a dash (-).")