From b96d601c9d4b41f4d316131dc547c2dd1a85e556 Mon Sep 17 00:00:00 2001 From: Marc Daya Date: Tue, 19 Jul 2016 15:18:15 -0600 Subject: [PATCH] Handle unknown external ID in interpro2ipr files. Since we're not in control of the external datasources that are used by datasource providers, validating all external IDs makes our code brittle. Since we don't use sources that we don't know about, we can simply gloss over the external IDs that we don't recognise. Doing so yields triples of the form: . . . . . . "cd01991"@en . --- .../InterProExternalReferenceFactory.java | 11 +- .../hgnc/HgncDownloadFileParser.java | 2 +- .../irefweb/IRefWebPsiMitab2_6FileParser.java | 10 +- .../pharmgkb/PharmGkbGeneFileParser.java | 2 +- .../pharmgkb/PharmGkbRelationFileParser.java | 306 ++++++++++-------- .../fileparsers/pro/ProMappingFileParser.java | 2 +- .../rgd/RgdAnnotationFileIdResolver.java | 2 +- .../InterProProtein2IprDatFileParserTest.java | 133 ++++++++ .../pro/ProMappingFileParserTest.java | 16 +- .../identifiers/DataSourceIdResolver.java | 4 +- .../UnknownDataSourceIdentifier.java | 43 +++ 11 files changed, 375 insertions(+), 156 deletions(-) create mode 100644 datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/interpro/InterProProtein2IprDatFileParserTest.java diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java index 738c8a7..1cba8fa 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java @@ -34,6 +34,7 @@ */ import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.Gene3dID; import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.HamapAnnotationRuleID; import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.PantherID; @@ -63,7 +64,9 @@ public class InterProExternalReferenceFactory { private static final String PRODOM_PREFIX = "PD"; - public static DataSourceIdentifier parseExternalReference(String databaseReferenceID) { + public static DataSourceIdentifier parseExternalReference( + String databaseReferenceID) + { if (databaseReferenceID.startsWith(PFAM_PREFIX)) return new PfamID(databaseReferenceID); if (databaseReferenceID.startsWith(TIGRFAMS_PREFIX)) @@ -87,9 +90,9 @@ public static DataSourceIdentifier parseExternalReference(String databas if (databaseReferenceID.startsWith(HAMAP_PREFIX)) return new HamapAnnotationRuleID(databaseReferenceID); if (databaseReferenceID.startsWith(PRODOM_PREFIX)) - return new ProDomID(databaseReferenceID); - throw new IllegalArgumentException(String.format("Unknown external database ID type for ID: %s", - databaseReferenceID)); + return new ProDomID(databaseReferenceID); + + return new UnknownDataSourceIdentifier(databaseReferenceID); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java index 690b3af..7b63a72 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java @@ -558,7 +558,7 @@ private DataSourceIdentifier resolveSpecialistId(String idStr, String link) { return new SlcId(idStr); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } /** diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java index 4643a13..10f0230 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java @@ -334,7 +334,7 @@ private Set> resolveInteractionDbIds(String interactionI } else if (id.startsWith("HPRD")) { ids.add(new HprdID(StringUtil.removePrefix(id, "HPRD:"))); } else { - ids.add(new UnknownDataSourceIdentifier(id, null)); + ids.add(new UnknownDataSourceIdentifier(id)); } } catch (IllegalArgumentException e) { ids.add(new ProbableErrorDataSourceIdentifier(id, null, e.getMessage())); @@ -360,10 +360,10 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { return null; } if (idStr.startsWith("xx:")) { - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } if (idStr.startsWith("other:")) { - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } if (idStr.equals("null")) { return null; @@ -486,7 +486,7 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } /** @@ -683,7 +683,7 @@ private DataSourceIdentifier resolveAliasId(String aliasStr) { } else if (aliasStr.startsWith("hgnc:")) { return new HgncGeneSymbolID(StringUtil.removePrefix(aliasStr, "hgnc:")); } - return new UnknownDataSourceIdentifier(aliasStr, null); + return new UnknownDataSourceIdentifier(aliasStr); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java index 23327d1..dbc23b3 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java @@ -274,7 +274,7 @@ private DataSourceIdentifier resolveCrossRefId(String refStr) { } else if (refStr.startsWith(URL_PREFIX)) { return new CrossReferenceUrl(StringUtil.removePrefix(refStr, URL_PREFIX)); } else { - return new UnknownDataSourceIdentifier(refStr, null); + return new UnknownDataSourceIdentifier(refStr); } } catch (IllegalArgumentException e) { return new ProbableErrorDataSourceIdentifier(refStr, null, e.getMessage()); diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java index 7159666..d9f7282 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java @@ -2,9 +2,10 @@ /* * #%L - * Colorado Computational Pharmacology's common module + * Colorado Computational Pharmacology's datasource + * project * %% - * Copyright (C) 2012 - 2015 Regents of the University of Colorado + * Copyright (C) 2012 - 2016 Regents of the University of Colorado * %% * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -33,139 +34,170 @@ * #L% */ - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; - -import org.apache.log4j.Logger; - -import edu.ucdenver.ccp.common.file.CharacterEncoding; -import edu.ucdenver.ccp.common.file.reader.Line; -import edu.ucdenver.ccp.common.string.RegExPatterns; -import edu.ucdenver.ccp.common.string.StringConstants; -import edu.ucdenver.ccp.common.string.StringUtil; -import edu.ucdenver.ccp.common.string.StringUtil.RemoveFieldEnclosures; -import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; -import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +/* + * Colorado Computational Pharmacology's common module + * + * Copyright (C) 2012 - 2015 Regents of the University of Colorado + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +import org.apache.log4j.Logger; + +import edu.ucdenver.ccp.common.file.CharacterEncoding; +import edu.ucdenver.ccp.common.file.reader.Line; +import edu.ucdenver.ccp.common.string.RegExPatterns; +import edu.ucdenver.ccp.common.string.StringConstants; +import edu.ucdenver.ccp.common.string.StringUtil; +import edu.ucdenver.ccp.common.string.StringUtil.RemoveFieldEnclosures; +import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; +import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; -import edu.ucdenver.ccp.datasource.identifiers.ncbi.RefSnpID; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbHaplotypeId; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbVariantLocationId; -import edu.ucdenver.ccp.identifier.publication.PubMedID; - -public class PharmGkbRelationFileParser extends SingleLineFileRecordReader { - - private static final Logger logger = Logger.getLogger(PharmGkbRelationFileParser.class); - - private static final String HEADER = "Entity1_id\tEntity1_name\tEntity1_type\tEntity2_id\tEntity2_name\tEntity2_type\tEvidence\tAssociation\tPK\tPD\tPMIDs"; - - private static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; - - private static final String PHARMGKB_ID_PREFIX = "PA"; - - private static final String REFSNP_ID_PATTERN = "rs\\d+"; - - private static final Object ENTITY_TYPE_HAPLOTYPE = "Haplotype"; - - private static final Object ENTITY_TYPE_VARIANT_LOCATION = "VariantLocation"; - - public PharmGkbRelationFileParser(File dataFile, CharacterEncoding encoding) throws IOException { - super(dataFile, encoding, null); - } - - @Override - protected String getFileHeader() throws IOException { - return readLine().getText(); - } - - @Override - protected String getExpectedFileHeader() throws IOException { - return HEADER; - } - - @Override - protected PharmGkbRelationFileRecord parseRecordFromLine(Line line) { - String[] toks = line.getText().split(RegExPatterns.TAB, -1); - String entity1Name = toks[1]; - String entity1Type = toks[2]; - Set> entity1Id = resolveEntityId(toks[0], entity1Type); - String entity2Name = toks[4]; - String entity2Type = toks[5]; - Set> entity2Id = resolveEntityId(toks[3], entity2Type); - Set evidence = new HashSet(StringUtil.delimitAndTrim(toks[6], StringConstants.COMMA, null, - RemoveFieldEnclosures.FALSE)); - String association = toks[7]; - String pk = toks[8]; - String pd = toks[9]; - Collection pmids = getPmids(toks[10]); - return new PharmGkbRelationFileRecord(entity1Id, entity1Name, entity1Type, entity2Id, entity2Name, entity2Type, - evidence, association, pk, pd, pmids, line.getByteOffset(), line.getLineNumber()); - } - - /** - * entities in the relationships.tsv file can be PharmGkb accession IDs (e.g. PA450626), RefSnp - * IDs (e.g. rs10209881), Haplotypes (e.g. CYP2A6*1B10), chromosome positions (e.g. - * chr20:48184659 (hg19)) - * - * @param string - * @return - */ - private Set> resolveEntityId(String idStr, String entityType) { - Set> ids = new HashSet>(); - String[] toks = idStr.split(","); - for (String id : toks) { - if (idStr.startsWith(PHARMGKB_ID_PREFIX)) { - ids.add(new PharmGkbID(id)); - } else if (idStr.matches(REFSNP_ID_PATTERN)) { - ids.add(new RefSnpID(id)); - } else if (entityType.equals(ENTITY_TYPE_HAPLOTYPE)) { - ids.add(new PharmGkbHaplotypeId(id)); - } else if (entityType.equals(ENTITY_TYPE_VARIANT_LOCATION)) { - ids.add(new PharmGkbVariantLocationId(id)); - } else { - ids.add(new UnknownDataSourceIdentifier(id, null)); - } - } - return ids; - - } - - /** - * @param string - * @return - */ - private Collection getPmids(String pmidStr) { - Collection pmids = new ArrayList(); - if (!pmidStr.isEmpty()) { - for (String pmid : pmidStr.split(StringConstants.SEMICOLON)) { - if (pmid.startsWith(StringConstants.COLON)) { - // there are some PMIDs prefixed by a colon in the relationships.tsv file, e.g. - // ":17522595". Here we remove the leading colon if it is present. - pmid = StringUtil.removePrefix(pmid, StringConstants.COLON); - } - if (pmid.startsWith(StringConstants.LEFT_SQUARE_BRACKET)) { - // there are some PMIDs prefixed by a [ in the relationships.tsv file, e.g. - // "[11866883". Here we remove the leading [ if it is present. - pmid = StringUtil.removePrefix(pmid, StringConstants.LEFT_SQUARE_BRACKET); - } - if (pmid.endsWith(StringConstants.RIGHT_SQUARE_BRACKET)) { - // there are some PMIDs suffixed by a } in the relationships.tsv file, e.g. - // "22020825]". Here we remove the trailing ] if it is present. - pmid = StringUtil.removeSuffix(pmid, StringConstants.RIGHT_SQUARE_BRACKET); - } - if (pmid.matches(RegExPatterns.HAS_NUMBERS_ONLY)) { - pmids.add(new PubMedID(pmid)); - } else { - logger.warn("Unhandled PMID format: " + pmid); - } - } - } - return pmids; - } - -} +import edu.ucdenver.ccp.datasource.identifiers.ncbi.RefSnpID; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbHaplotypeId; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbVariantLocationId; +import edu.ucdenver.ccp.identifier.publication.PubMedID; + +public class PharmGkbRelationFileParser extends SingleLineFileRecordReader { + + private static final Logger logger = Logger.getLogger(PharmGkbRelationFileParser.class); + + private static final String HEADER = "Entity1_id\tEntity1_name\tEntity1_type\tEntity2_id\tEntity2_name\tEntity2_type\tEvidence\tAssociation\tPK\tPD\tPMIDs"; + + private static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; + + private static final String PHARMGKB_ID_PREFIX = "PA"; + + private static final String REFSNP_ID_PATTERN = "rs\\d+"; + + private static final Object ENTITY_TYPE_HAPLOTYPE = "Haplotype"; + + private static final Object ENTITY_TYPE_VARIANT_LOCATION = "VariantLocation"; + + public PharmGkbRelationFileParser(File dataFile, CharacterEncoding encoding) throws IOException { + super(dataFile, encoding, null); + } + + @Override + protected String getFileHeader() throws IOException { + return readLine().getText(); + } + + @Override + protected String getExpectedFileHeader() throws IOException { + return HEADER; + } + + @Override + protected PharmGkbRelationFileRecord parseRecordFromLine(Line line) { + String[] toks = line.getText().split(RegExPatterns.TAB, -1); + String entity1Name = toks[1]; + String entity1Type = toks[2]; + Set> entity1Id = resolveEntityId(toks[0], entity1Type); + String entity2Name = toks[4]; + String entity2Type = toks[5]; + Set> entity2Id = resolveEntityId(toks[3], entity2Type); + Set evidence = new HashSet(StringUtil.delimitAndTrim(toks[6], StringConstants.COMMA, null, + RemoveFieldEnclosures.FALSE)); + String association = toks[7]; + String pk = toks[8]; + String pd = toks[9]; + Collection pmids = getPmids(toks[10]); + return new PharmGkbRelationFileRecord(entity1Id, entity1Name, entity1Type, entity2Id, entity2Name, entity2Type, + evidence, association, pk, pd, pmids, line.getByteOffset(), line.getLineNumber()); + } + + /** + * entities in the relationships.tsv file can be PharmGkb accession IDs (e.g. PA450626), RefSnp + * IDs (e.g. rs10209881), Haplotypes (e.g. CYP2A6*1B10), chromosome positions (e.g. + * chr20:48184659 (hg19)) + * + * @param string + * @return + */ + private Set> resolveEntityId(String idStr, String entityType) { + Set> ids = new HashSet>(); + String[] toks = idStr.split(","); + for (String id : toks) { + if (idStr.startsWith(PHARMGKB_ID_PREFIX)) { + ids.add(new PharmGkbID(id)); + } else if (idStr.matches(REFSNP_ID_PATTERN)) { + ids.add(new RefSnpID(id)); + } else if (entityType.equals(ENTITY_TYPE_HAPLOTYPE)) { + ids.add(new PharmGkbHaplotypeId(id)); + } else if (entityType.equals(ENTITY_TYPE_VARIANT_LOCATION)) { + ids.add(new PharmGkbVariantLocationId(id)); + } else { + ids.add(new UnknownDataSourceIdentifier(id)); + } + } + return ids; + + } + + /** + * @param string + * @return + */ + private Collection getPmids(String pmidStr) { + Collection pmids = new ArrayList(); + if (!pmidStr.isEmpty()) { + for (String pmid : pmidStr.split(StringConstants.SEMICOLON)) { + if (pmid.startsWith(StringConstants.COLON)) { + // there are some PMIDs prefixed by a colon in the relationships.tsv file, e.g. + // ":17522595". Here we remove the leading colon if it is present. + pmid = StringUtil.removePrefix(pmid, StringConstants.COLON); + } + if (pmid.startsWith(StringConstants.LEFT_SQUARE_BRACKET)) { + // there are some PMIDs prefixed by a [ in the relationships.tsv file, e.g. + // "[11866883". Here we remove the leading [ if it is present. + pmid = StringUtil.removePrefix(pmid, StringConstants.LEFT_SQUARE_BRACKET); + } + if (pmid.endsWith(StringConstants.RIGHT_SQUARE_BRACKET)) { + // there are some PMIDs suffixed by a } in the relationships.tsv file, e.g. + // "22020825]". Here we remove the trailing ] if it is present. + pmid = StringUtil.removeSuffix(pmid, StringConstants.RIGHT_SQUARE_BRACKET); + } + if (pmid.matches(RegExPatterns.HAS_NUMBERS_ONLY)) { + pmids.add(new PubMedID(pmid)); + } else { + logger.warn("Unhandled PMID format: " + pmid); + } + } + } + return pmids; + } + +} diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java index 94e8bb2..e09c71d 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java @@ -139,6 +139,6 @@ private DataSourceIdentifier resolveId(String idStr) { } catch (IllegalArgumentException e) { return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java index f477e67..c1e5921 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java @@ -117,7 +117,7 @@ public DataSourceIdentifier resolveId(String idStr) { if (idStr.startsWith("UniProtKB:")) { return new UniProtID(idStr.substring(10)); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } /* diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/interpro/InterProProtein2IprDatFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/interpro/InterProProtein2IprDatFileParserTest.java new file mode 100644 index 0000000..d1110d7 --- /dev/null +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/interpro/InterProProtein2IprDatFileParserTest.java @@ -0,0 +1,133 @@ + +/* + * Colorado Computational Pharmacology's datasource module + * + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package edu.ucdenver.ccp.datasource.fileparsers.interpro; + +/* + * #%L + * Colorado Computational Pharmacology's datasource + * project + * %% + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * #L% + */ + +import edu.ucdenver.ccp.common.file.CharacterEncoding; +import edu.ucdenver.ccp.common.file.reader.Line; +import edu.ucdenver.ccp.datasource.fileparsers.ebi.interpro.InterProProtein2IprDatFileData; +import edu.ucdenver.ccp.datasource.fileparsers.ebi.interpro.InterProProtein2IprDatFileParser; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; + +import org.junit.Test; + +import java.io.File; +import java.io.IOException; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThat; +import static org.hamcrest.core.IsInstanceOf.instanceOf; + +public class InterProProtein2IprDatFileParserTest { + + @Test + public void parseLineWithUnknownExternalId() + throws Exception + { + InterProProtein2IprDatFileData dat = + new TestParser().parseRecordFromLine( + makeLine( + "A0A004\tIPR001962\tAsparagine synthase\tcd01991\t241\t565")); + // Doesn't throw an error for unknown IDs. + assertNotNull(dat); + // External ID handled correctly. + assertThat(dat.getExternalReference(), + instanceOf(UnknownDataSourceIdentifier.class)); + } + + /* --------------------------------------------------------------------- */ + + static class TestParser extends InterProProtein2IprDatFileParser + { + TestParser() + throws IOException + { + super(makeTempFile(), CharacterEncoding.US_ASCII); + } + + private static File makeTempFile() + throws IOException + { + File f = File.createTempFile("protein2ipr", null); + f.deleteOnExit(); + return f; + } + + public InterProProtein2IprDatFileData parseRecordFromLine(Line l) { + return super.parseRecordFromLine(l); + } + } + + private Line makeLine(String textSansTerminator) { + return new Line(textSansTerminator, + Line.LineTerminator.CR, + 0, // character offset + 0, // code point offset + 1, // line number + -1); // byte offset + } +} diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java index 8c9f3f9..e65f8d9 100644 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java @@ -109,16 +109,24 @@ private void validateRecord(ProMappingRecord record, ProteinOntologyId expectedP } private void validateRecord1(ProMappingRecord record) { - validateRecord(record, new ProteinOntologyId("PR:000000005"), "is_a", new HgncID("HGNC:11773")); + validateRecord(record, + new ProteinOntologyId("PR:000000005"), + "is_a", + new HgncID("HGNC:11773")); } private void validateRecord2(ProMappingRecord record) { - validateRecord(record, new ProteinOntologyId("PR:000000005"), "is_a", new UnknownDataSourceIdentifier( - "UniProtKB_VAR:VAR_022359", null)); + validateRecord(record, + new ProteinOntologyId("PR:000000005"), + "is_a", + new UnknownDataSourceIdentifier("UniProtKB_VAR:VAR_022359")); } private void validateRecord3(ProMappingRecord record) { - validateRecord(record, new ProteinOntologyId("PR:000000006"), "exact", new UniProtID("P37173")); + validateRecord(record, + new ProteinOntologyId("PR:000000006"), + "exact", + new UniProtID("P37173")); } } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java index 6b0fddd..cc4d6c0 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java @@ -452,7 +452,7 @@ else if (geneIDStr.startsWith("NCBITaxon:")) return new NcbiTaxonomyID(StringUtil.removePrefix(geneIDStr, "NCBITaxon:")); logger.warn(String.format("Unhandled gene ID format: %s. Creating UnknownDataSourceIdentifier.", geneIDStr)); - return new UnknownDataSourceIdentifier(geneIDStr, null); + return new UnknownDataSourceIdentifier(geneIDStr); } catch (IllegalArgumentException e) { logger.warn("Invalid ID detected... " + e.getMessage()); return new ProbableErrorDataSourceIdentifier(geneIDStr, null, e.getMessage()); @@ -481,7 +481,7 @@ private static DataSourceIdentifier resolveInteractionID(String interactionID logger.warn(String.format("Unknown interaction ID format: %s. Cannot create DataElementIdentifier.", interactionIDStr)); - return new UnknownDataSourceIdentifier(interactionIDStr, null); + return new UnknownDataSourceIdentifier(interactionIDStr); } /** diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java index 14a91db..11c26da 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java @@ -34,6 +34,38 @@ * #L% */ +/* + * Colorado Computational Pharmacology's datasource project + * + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + public class UnknownDataSourceIdentifier extends DataSourceIdentifier { private final String dataSourceStr; @@ -43,6 +75,17 @@ public UnknownDataSourceIdentifier(String resourceID, String dataSourceStr) { this.dataSourceStr = dataSourceStr; } + /** + * Constructor that may be used when a canonical identifier for the source + * in which the resource identifier is defined is not available. + * + * @param resourceID + * The identifier for the resource in the external, unknown source. + */ + public UnknownDataSourceIdentifier(String resourceID) { + this(resourceID, null); + } + @Override public String validate(String resourceID) throws IllegalArgumentException { return resourceID;