Skip to content

Commit

Permalink
Added argument for identifier resolution to improve error messages
Browse files Browse the repository at this point in the history
  • Loading branch information
bill-baumgartner committed Feb 9, 2016
1 parent 27e6660 commit 7bf4214
Show file tree
Hide file tree
Showing 19 changed files with 153 additions and 134 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@
import edu.ucdenver.ccp.datasource.fileparsers.obo.MiOntologyIdTermPair;
import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader;
import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver;
import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractionID;
import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID;
Expand All @@ -77,8 +78,8 @@
import edu.ucdenver.ccp.identifier.publication.PubMedID;

/**
* This class is used to parse DIPYYYMMDD files which can be downloaded from the DIP website:
* http://dip.doe-mbi.ucla.edu/dip/Main.cgi
* This class is used to parse DIPYYYMMDD files which can be downloaded from the
* DIP website: http://dip.doe-mbi.ucla.edu/dip/Main.cgi
*
* @author Bill Baumgartner
*
Expand Down Expand Up @@ -197,7 +198,8 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
DipInteractionType interactionType = MiOntologyIdTermPair.parseString(DipInteractionType.class,
interactionTypes[i]);
DipProcessingStatus processingStatus = getDipProcessingStatus(processingStatuses[i], line);
String firstAuthorName = null; // change if the first author column ever contains names
String firstAuthorName = null; // change if the first author column
// ever contains names
DipPublication publication = getDipPublication(firstAuthorName, pmids[i * 2], pmids[i * 2 + 1]);

experiments.add(new DipInteractionExperiment(publication, processingStatus, detectionMethod,
Expand All @@ -212,9 +214,9 @@ private Set<DipInteractionExperiment> getInteractionExperiments(String detection
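* @param firstAuthorName
*            first author name (the visible caller currently passes null)
* @param pmidStr
*            PubMed reference string, e.g. "pubmed:9194558"
* @param dipPubIdStr
*            DIP publication reference string, e.g. "pubmed:DIP-209S"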
* @return {@link DipPublication} from first author name and conversions of strings like
* "pubmed:9194558" and "pubmed:DIP-209S" into a {@link PubMedID} and a
* {@link DipPublicationId}
* @return {@link DipPublication} from first author name and conversions of
* strings like "pubmed:9194558" and "pubmed:DIP-209S" into a
* {@link PubMedID} and a {@link DipPublicationId}
*/
private DipPublication getDipPublication(String firstAuthorName, String pmidStr, String dipPubIdStr) {
PubMedID pmid;
Expand All @@ -230,7 +232,8 @@ private DipPublication getDipPublication(String firstAuthorName, String pmidStr,

/**
* @param statusStr
*            processing status string to parse, e.g. "dip:0002(small scale)"
* @return {@link DipProcessingStatus} parsed from a string such as: "dip:0002(small scale)"
* @return {@link DipProcessingStatus} parsed from a string such as:
* "dip:0002(small scale)"
*/
private DipProcessingStatus getDipProcessingStatus(String statusStr, String line) {
Pattern p = Pattern.compile("(dip:\\d+)\\((.*?)\\)");
Expand Down Expand Up @@ -258,8 +261,9 @@ private DipInteractor getInteractor(String interactorStr, String alternateIdsStr
}

/*
* The columns for alternate IDs and aliases are always set to "-". If this is no
* longer the case then an exception will be thrown and code changes required.
* The columns for alternate IDs and aliases are always set to
* "-". If this is no longer the case then an exception will be
* thrown and code changes required.
*/
Set<DipInteractorID> alternateIds = null;
if (!alternateIdsStr.trim().equals("-")) {
Expand Down Expand Up @@ -318,13 +322,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
return new DipInteractorID(idStr);
}
if (idStr.startsWith("refseq:")) {
try {
return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"));
// return new RefSeqID(StringUtil.removePrefix(idStr, "refseq:"));
} catch (IllegalArgumentException e) {
logger.warn("Invalid RefSeq identifier detected: " + idStr);
return null;
}
return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"), idStr);
}
if (idStr.startsWith("uniprotkb:")) {
if (idStr.contains(StringConstants.HYPHEN_MINUS)) {
Expand All @@ -333,8 +331,7 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
try {
return new UniProtID(StringUtil.removePrefix(idStr, "uniprotkb:"));
} catch (IllegalArgumentException e) {
logger.warn("Invalid UniProt identifier detected: " + idStr);
return null;
return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage());
}
}
throw new IllegalArgumentException("Unhandled identifier type: " + idStr);
Expand All @@ -347,16 +344,16 @@ private DataSourceIdentifier<?> resolveId(String idStr) {
// * MI id
// * @return id if recognized; otherwise, null
// */
// private static MolecularInteractionOntologyTermID extractMiId(String inputStr) {
// private static MolecularInteractionOntologyTermID extractMiId(String
// inputStr) {
// Pattern methodIDPattern = Pattern.compile("(MI:\\d+),?\\(");
// Matcher m = methodIDPattern.matcher(inputStr);
// if (m.find()) {
// return new MolecularInteractionOntologyTermID(m.group(1));
// }
// logger.error("Unable to locate ExperimentalMethod MI ID in String: " + inputStr);
// logger.error("Unable to locate ExperimentalMethod MI ID in String: " +
// inputStr);
// return null;
// }



}
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,6 @@
import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggCompoundID;
import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggDrugID;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.MeshID;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.GiNumberID;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.snp.SnpRsId;
import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID;
import edu.ucdenver.ccp.datasource.identifiers.obo.ChebiOntologyID;
Expand Down Expand Up @@ -1299,22 +1298,16 @@ private static DataSourceIdentifier<?> resolveIdentifier(String resource, String
} else if (resource.equals("GeneCards")) {
return new GeneCardId(identifier);
} else if (resource.equals("GenBank Gene Database")) {
return NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
return NucleotideAccessionResolver.resolveNucleotideAccession(identifier, "GenBank Gene Database:"
+ identifier);
} else if (resource.equals("GenBank Protein Database")) {
try {
return ProteinAccessionResolver.resolveProteinAccession(identifier);
} catch (IllegalArgumentException e) {
if (identifier.matches("\\d+")) {
return new GiNumberID(identifier);
} else {
return new ProbableErrorDataSourceIdentifier("identifier", "GenBank",
"Observed invalid GenBank protein identifier: " + identifier);
}
}
return ProteinAccessionResolver
.resolveProteinAccession(identifier, "GenBank Protein Database" + identifier);
} else if (resource.equals("GenBank")) {
DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier);
DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier,
"GenBank:" + identifier);
if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId.getClass())) {
return ProteinAccessionResolver.resolveProteinAccession(identifier);
return ProteinAccessionResolver.resolveProteinAccession(identifier, "GenBank:" + identifier);
}
} else if (resource.equals("UniProtKB")) {
return new UniProtID(identifier);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -500,7 +500,7 @@ protected abstract T invokeConstructor(E idLineContents, List<D> accessionNumber
private EmblAssemblyInformation parseASLine(String line) {
String[] toks = line.split("\\s+");
String localSpan = toks[1];
DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2]);
DataSourceIdentifier<?> primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2], toks[2]);
String primarySpan = toks[3];
boolean originatesFromComplementary = (toks.length == 5 && toks[4].trim().equalsIgnoreCase("c")) ? true : false;
return new EmblAssemblyInformation(localSpan, primaryIdentifier, primarySpan, originatesFromComplementary);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,7 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt
if (acc.matches("\\d+")) {
nucleotideId = new GiNumberID(acc);
} else {
nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, refseqURL);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -564,9 +564,9 @@ private Set<DataSourceIdentifier<?>> resolveAccessionNumbers(String accListStr)
Set<DataSourceIdentifier<?>> accNumbers = new HashSet<DataSourceIdentifier<?>>();
if (!accListStr.isEmpty()) {
for (String acc : accListStr.split(",")) {
DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, acc);
if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) {
DataSourceIdentifier<String> proAccId = ProteinAccessionResolver.resolveProteinAccession(acc);
DataSourceIdentifier<String> proAccId = ProteinAccessionResolver.resolveProteinAccession(acc, acc);
accNumbers.add(proAccId);
} else {
accNumbers.add(nucAccId);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -115,9 +115,9 @@ protected HprdIdMappingsTxtFileData parseRecordFromLine(Line line) {
}

/**
* Resolves an accession string to a {@link DataSourceIdentifier}. The
* accession is first passed to {@link NucleotideAccessionResolver}; if that
* yields a {@link ProbableErrorDataSourceIdentifier}, the accession is passed
* to {@link ProteinAccessionResolver} instead and that result is returned.
*
* NOTE(review): this listing is a rendered diff, so the one-argument and
* two-argument resolver calls below appear as before/after pairs. The
* two-argument form looks like the updated one (the commit message mentions
* an added argument for identifier resolution) -- confirm against the
* repository before treating this text as compilable source.
*
* @param acc
*            the accession to resolve
* @return the nucleotide resolver's identifier, or the protein resolver's
*         result when the nucleotide result is a probable-error identifier
*/
private DataSourceIdentifier<?> resolveAccession(String acc) {
// Try the accession as a nucleotide accession first.
DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc);
DataSourceIdentifier<String> nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, acc);
// A probable-error result triggers the protein-accession fallback.
if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) {
return ProteinAccessionResolver.resolveProteinAccession(acc);
return ProteinAccessionResolver.resolveProteinAccession(acc, acc);
}
return nucAccId;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@
import edu.ucdenver.ccp.datasource.identifiers.irefweb.IrigId;
import edu.ucdenver.ccp.datasource.identifiers.irefweb.RigId;
import edu.ucdenver.ccp.datasource.identifiers.other.ImexId;
import edu.ucdenver.ccp.identifier.publication.PubMedID;

@Data
@Record(dataSource = DataSource.IREFWEB, label="interaction")
Expand All @@ -57,7 +56,7 @@ public class IRefWebInteraction implements DataRecord {
@RecordField(comment = "Notes: According to MITAB2.6 format this column should contain a pipe-delimited list of author surnames in which the interaction has been shown.\nThis column will usually include only one author name reference. However, some experimental evidences have secondary references which could be included here. This filed also includes references which are not author names as in the following examples:\nOPHID Predicted Protein Interaction\nHPRD Text Mining Confirmation\nMINT Text Mining Confirmation")
private final String author;
@RecordField(comment = "Notes: This is a non-redundant list of PubMed identifiers pointing to literature that supports the interaction. According to MITAB2.6 format, this column should contain a pipe-delimited set of databaseName:identifier pairs such as pubmed:12345. The source database name is always pubmed.")
private final Set<PubMedID> pmids;
private final Set<DataSourceIdentifier<?>> pmids;
@RecordField
private final IRefWebInteractionType interactionType;
@RecordField(comment = "source interaction-database and accessions.\nExample: intact:EBI-761694|rigid:3ERiFkUFsm7ZUHIRJTx8ZlHILRA|irigid:1234|edgetype:X\nNotes: Each reference is presented as a database name:identifier pair.\nChange: The source database is listed first. Additional information is pipe-delimited and presented here for the convenience of PSICQUIC web-service users (these services presently truncate this file at column 15 as they only support MITAB2.5). See columns 35,45,53.\nThe source database names that appear in this column are taken from the PSI-MI controlled vocabulary at the following location (where possible): http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI\nIf an interaction record identifier is not provided by the source database, this entry will appear as database-name:- with the identifier region replaced with a dash (-).")
Expand Down
Loading

0 comments on commit 7bf4214

Please sign in to comment.