diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java index 738c8a7..1cba8fa 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProExternalReferenceFactory.java @@ -34,6 +34,7 @@ */ import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.Gene3dID; import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.HamapAnnotationRuleID; import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.PantherID; @@ -63,7 +64,9 @@ public class InterProExternalReferenceFactory { private static final String PRODOM_PREFIX = "PD"; - public static DataSourceIdentifier parseExternalReference(String databaseReferenceID) { + public static DataSourceIdentifier parseExternalReference( + String databaseReferenceID) + { if (databaseReferenceID.startsWith(PFAM_PREFIX)) return new PfamID(databaseReferenceID); if (databaseReferenceID.startsWith(TIGRFAMS_PREFIX)) @@ -87,9 +90,9 @@ public static DataSourceIdentifier parseExternalReference(String databas if (databaseReferenceID.startsWith(HAMAP_PREFIX)) return new HamapAnnotationRuleID(databaseReferenceID); if (databaseReferenceID.startsWith(PRODOM_PREFIX)) - return new ProDomID(databaseReferenceID); - throw new IllegalArgumentException(String.format("Unknown external database ID type for ID: %s", - databaseReferenceID)); + return new ProDomID(databaseReferenceID); + + return new UnknownDataSourceIdentifier(databaseReferenceID); } } diff --git 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java index 690b3af..7b63a72 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java @@ -558,7 +558,7 @@ private DataSourceIdentifier resolveSpecialistId(String idStr, String link) { return new SlcId(idStr); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } /** diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java index 4643a13..10f0230 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java @@ -334,7 +334,7 @@ private Set> resolveInteractionDbIds(String interactionI } else if (id.startsWith("HPRD")) { ids.add(new HprdID(StringUtil.removePrefix(id, "HPRD:"))); } else { - ids.add(new UnknownDataSourceIdentifier(id, null)); + ids.add(new UnknownDataSourceIdentifier(id)); } } catch (IllegalArgumentException e) { ids.add(new ProbableErrorDataSourceIdentifier(id, null, e.getMessage())); @@ -360,10 +360,10 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { return null; } if (idStr.startsWith("xx:")) { - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } if (idStr.startsWith("other:")) { - return new UnknownDataSourceIdentifier(idStr, null); + return new 
UnknownDataSourceIdentifier(idStr); } if (idStr.equals("null")) { return null; @@ -486,7 +486,7 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } /** @@ -683,7 +683,7 @@ private DataSourceIdentifier resolveAliasId(String aliasStr) { } else if (aliasStr.startsWith("hgnc:")) { return new HgncGeneSymbolID(StringUtil.removePrefix(aliasStr, "hgnc:")); } - return new UnknownDataSourceIdentifier(aliasStr, null); + return new UnknownDataSourceIdentifier(aliasStr); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java index 23327d1..dbc23b3 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java @@ -274,7 +274,7 @@ private DataSourceIdentifier resolveCrossRefId(String refStr) { } else if (refStr.startsWith(URL_PREFIX)) { return new CrossReferenceUrl(StringUtil.removePrefix(refStr, URL_PREFIX)); } else { - return new UnknownDataSourceIdentifier(refStr, null); + return new UnknownDataSourceIdentifier(refStr); } } catch (IllegalArgumentException e) { return new ProbableErrorDataSourceIdentifier(refStr, null, e.getMessage()); diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java index 7159666..d9f7282 100644 --- 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java @@ -2,9 +2,10 @@ /* * #%L - * Colorado Computational Pharmacology's common module + * Colorado Computational Pharmacology's datasource + * project * %% - * Copyright (C) 2012 - 2015 Regents of the University of Colorado + * Copyright (C) 2012 - 2016 Regents of the University of Colorado * %% * Redistribution and use in source and binary forms, with or without modification, * are permitted provided that the following conditions are met: @@ -33,139 +34,170 @@ * #L% */ - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; - -import org.apache.log4j.Logger; - -import edu.ucdenver.ccp.common.file.CharacterEncoding; -import edu.ucdenver.ccp.common.file.reader.Line; -import edu.ucdenver.ccp.common.string.RegExPatterns; -import edu.ucdenver.ccp.common.string.StringConstants; -import edu.ucdenver.ccp.common.string.StringUtil; -import edu.ucdenver.ccp.common.string.StringUtil.RemoveFieldEnclosures; -import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; -import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +/* + * Colorado Computational Pharmacology's common module + * + * Copyright (C) 2012 - 2015 Regents of the University of Colorado + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +import org.apache.log4j.Logger; + +import edu.ucdenver.ccp.common.file.CharacterEncoding; +import edu.ucdenver.ccp.common.file.reader.Line; +import edu.ucdenver.ccp.common.string.RegExPatterns; +import edu.ucdenver.ccp.common.string.StringConstants; +import edu.ucdenver.ccp.common.string.StringUtil; +import edu.ucdenver.ccp.common.string.StringUtil.RemoveFieldEnclosures; +import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; +import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; -import edu.ucdenver.ccp.datasource.identifiers.ncbi.RefSnpID; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbHaplotypeId; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbVariantLocationId; -import edu.ucdenver.ccp.identifier.publication.PubMedID; - -public class PharmGkbRelationFileParser extends SingleLineFileRecordReader { - - private static final Logger logger = Logger.getLogger(PharmGkbRelationFileParser.class); - - private static final String HEADER = "Entity1_id\tEntity1_name\tEntity1_type\tEntity2_id\tEntity2_name\tEntity2_type\tEvidence\tAssociation\tPK\tPD\tPMIDs"; - - private static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; - - private static final String PHARMGKB_ID_PREFIX = "PA"; - - private static final String REFSNP_ID_PATTERN = "rs\\d+"; - - private static final Object ENTITY_TYPE_HAPLOTYPE = "Haplotype"; - - private static final Object ENTITY_TYPE_VARIANT_LOCATION = "VariantLocation"; - - public PharmGkbRelationFileParser(File dataFile, CharacterEncoding encoding) throws IOException { - super(dataFile, encoding, null); - } - - @Override - protected String getFileHeader() 
throws IOException { - return readLine().getText(); - } - - @Override - protected String getExpectedFileHeader() throws IOException { - return HEADER; - } - - @Override - protected PharmGkbRelationFileRecord parseRecordFromLine(Line line) { - String[] toks = line.getText().split(RegExPatterns.TAB, -1); - String entity1Name = toks[1]; - String entity1Type = toks[2]; - Set> entity1Id = resolveEntityId(toks[0], entity1Type); - String entity2Name = toks[4]; - String entity2Type = toks[5]; - Set> entity2Id = resolveEntityId(toks[3], entity2Type); - Set evidence = new HashSet(StringUtil.delimitAndTrim(toks[6], StringConstants.COMMA, null, - RemoveFieldEnclosures.FALSE)); - String association = toks[7]; - String pk = toks[8]; - String pd = toks[9]; - Collection pmids = getPmids(toks[10]); - return new PharmGkbRelationFileRecord(entity1Id, entity1Name, entity1Type, entity2Id, entity2Name, entity2Type, - evidence, association, pk, pd, pmids, line.getByteOffset(), line.getLineNumber()); - } - - /** - * entities in the relationships.tsv file can be PharmGkb accession IDs (e.g. PA450626), RefSnp - * IDs (e.g. rs10209881), Haplotypes (e.g. CYP2A6*1B10), chromosome positions (e.g. 
- * chr20:48184659 (hg19)) - * - * @param string - * @return - */ - private Set> resolveEntityId(String idStr, String entityType) { - Set> ids = new HashSet>(); - String[] toks = idStr.split(","); - for (String id : toks) { - if (idStr.startsWith(PHARMGKB_ID_PREFIX)) { - ids.add(new PharmGkbID(id)); - } else if (idStr.matches(REFSNP_ID_PATTERN)) { - ids.add(new RefSnpID(id)); - } else if (entityType.equals(ENTITY_TYPE_HAPLOTYPE)) { - ids.add(new PharmGkbHaplotypeId(id)); - } else if (entityType.equals(ENTITY_TYPE_VARIANT_LOCATION)) { - ids.add(new PharmGkbVariantLocationId(id)); - } else { - ids.add(new UnknownDataSourceIdentifier(id, null)); - } - } - return ids; - - } - - /** - * @param string - * @return - */ - private Collection getPmids(String pmidStr) { - Collection pmids = new ArrayList(); - if (!pmidStr.isEmpty()) { - for (String pmid : pmidStr.split(StringConstants.SEMICOLON)) { - if (pmid.startsWith(StringConstants.COLON)) { - // there are some PMIDs prefixed by a colon in the relationships.tsv file, e.g. - // ":17522595". Here we remove the leading colon if it is present. - pmid = StringUtil.removePrefix(pmid, StringConstants.COLON); - } - if (pmid.startsWith(StringConstants.LEFT_SQUARE_BRACKET)) { - // there are some PMIDs prefixed by a [ in the relationships.tsv file, e.g. - // "[11866883". Here we remove the leading [ if it is present. - pmid = StringUtil.removePrefix(pmid, StringConstants.LEFT_SQUARE_BRACKET); - } - if (pmid.endsWith(StringConstants.RIGHT_SQUARE_BRACKET)) { - // there are some PMIDs suffixed by a } in the relationships.tsv file, e.g. - // "22020825]". Here we remove the trailing ] if it is present. 
- pmid = StringUtil.removeSuffix(pmid, StringConstants.RIGHT_SQUARE_BRACKET); - } - if (pmid.matches(RegExPatterns.HAS_NUMBERS_ONLY)) { - pmids.add(new PubMedID(pmid)); - } else { - logger.warn("Unhandled PMID format: " + pmid); - } - } - } - return pmids; - } - -} +import edu.ucdenver.ccp.datasource.identifiers.ncbi.RefSnpID; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbHaplotypeId; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbVariantLocationId; +import edu.ucdenver.ccp.identifier.publication.PubMedID; + +public class PharmGkbRelationFileParser extends SingleLineFileRecordReader { + + private static final Logger logger = Logger.getLogger(PharmGkbRelationFileParser.class); + + private static final String HEADER = "Entity1_id\tEntity1_name\tEntity1_type\tEntity2_id\tEntity2_name\tEntity2_type\tEvidence\tAssociation\tPK\tPD\tPMIDs"; + + private static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; + + private static final String PHARMGKB_ID_PREFIX = "PA"; + + private static final String REFSNP_ID_PATTERN = "rs\\d+"; + + private static final Object ENTITY_TYPE_HAPLOTYPE = "Haplotype"; + + private static final Object ENTITY_TYPE_VARIANT_LOCATION = "VariantLocation"; + + public PharmGkbRelationFileParser(File dataFile, CharacterEncoding encoding) throws IOException { + super(dataFile, encoding, null); + } + + @Override + protected String getFileHeader() throws IOException { + return readLine().getText(); + } + + @Override + protected String getExpectedFileHeader() throws IOException { + return HEADER; + } + + @Override + protected PharmGkbRelationFileRecord parseRecordFromLine(Line line) { + String[] toks = line.getText().split(RegExPatterns.TAB, -1); + String entity1Name = toks[1]; + String entity1Type = toks[2]; + Set> entity1Id = resolveEntityId(toks[0], entity1Type); + String entity2Name = toks[4]; + String entity2Type = toks[5]; + Set> 
entity2Id = resolveEntityId(toks[3], entity2Type); + Set evidence = new HashSet(StringUtil.delimitAndTrim(toks[6], StringConstants.COMMA, null, + RemoveFieldEnclosures.FALSE)); + String association = toks[7]; + String pk = toks[8]; + String pd = toks[9]; + Collection pmids = getPmids(toks[10]); + return new PharmGkbRelationFileRecord(entity1Id, entity1Name, entity1Type, entity2Id, entity2Name, entity2Type, + evidence, association, pk, pd, pmids, line.getByteOffset(), line.getLineNumber()); + } + + /** + * entities in the relationships.tsv file can be PharmGkb accession IDs (e.g. PA450626), RefSnp + * IDs (e.g. rs10209881), Haplotypes (e.g. CYP2A6*1B10), chromosome positions (e.g. + * chr20:48184659 (hg19)) + * + * @param string + * @return + */ + private Set> resolveEntityId(String idStr, String entityType) { + Set> ids = new HashSet>(); + String[] toks = idStr.split(","); + for (String id : toks) { + if (idStr.startsWith(PHARMGKB_ID_PREFIX)) { + ids.add(new PharmGkbID(id)); + } else if (idStr.matches(REFSNP_ID_PATTERN)) { + ids.add(new RefSnpID(id)); + } else if (entityType.equals(ENTITY_TYPE_HAPLOTYPE)) { + ids.add(new PharmGkbHaplotypeId(id)); + } else if (entityType.equals(ENTITY_TYPE_VARIANT_LOCATION)) { + ids.add(new PharmGkbVariantLocationId(id)); + } else { + ids.add(new UnknownDataSourceIdentifier(id)); + } + } + return ids; + + } + + /** + * @param string + * @return + */ + private Collection getPmids(String pmidStr) { + Collection pmids = new ArrayList(); + if (!pmidStr.isEmpty()) { + for (String pmid : pmidStr.split(StringConstants.SEMICOLON)) { + if (pmid.startsWith(StringConstants.COLON)) { + // there are some PMIDs prefixed by a colon in the relationships.tsv file, e.g. + // ":17522595". Here we remove the leading colon if it is present. + pmid = StringUtil.removePrefix(pmid, StringConstants.COLON); + } + if (pmid.startsWith(StringConstants.LEFT_SQUARE_BRACKET)) { + // there are some PMIDs prefixed by a [ in the relationships.tsv file, e.g. 
+ // "[11866883". Here we remove the leading [ if it is present. + pmid = StringUtil.removePrefix(pmid, StringConstants.LEFT_SQUARE_BRACKET); + } + if (pmid.endsWith(StringConstants.RIGHT_SQUARE_BRACKET)) { + // there are some PMIDs suffixed by a } in the relationships.tsv file, e.g. + // "22020825]". Here we remove the trailing ] if it is present. + pmid = StringUtil.removeSuffix(pmid, StringConstants.RIGHT_SQUARE_BRACKET); + } + if (pmid.matches(RegExPatterns.HAS_NUMBERS_ONLY)) { + pmids.add(new PubMedID(pmid)); + } else { + logger.warn("Unhandled PMID format: " + pmid); + } + } + } + return pmids; + } + +} diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java index 94e8bb2..e09c71d 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java @@ -139,6 +139,6 @@ private DataSourceIdentifier resolveId(String idStr) { } catch (IllegalArgumentException e) { return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java index f477e67..c1e5921 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java @@ -117,7 +117,7 @@ public DataSourceIdentifier resolveId(String idStr) { if 
(idStr.startsWith("UniProtKB:")) { return new UniProtID(idStr.substring(10)); } - return new UnknownDataSourceIdentifier(idStr, null); + return new UnknownDataSourceIdentifier(idStr); } /* diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/interpro/InterProProtein2IprDatFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/interpro/InterProProtein2IprDatFileParserTest.java new file mode 100644 index 0000000..d1110d7 --- /dev/null +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/interpro/InterProProtein2IprDatFileParserTest.java @@ -0,0 +1,133 @@ + +/* + * Colorado Computational Pharmacology's datasource module + * + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +package edu.ucdenver.ccp.datasource.fileparsers.interpro; + +/* + * #%L + * Colorado Computational Pharmacology's datasource + * project + * %% + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * #L% + */ + +import edu.ucdenver.ccp.common.file.CharacterEncoding; +import edu.ucdenver.ccp.common.file.reader.Line; +import edu.ucdenver.ccp.datasource.fileparsers.ebi.interpro.InterProProtein2IprDatFileData; +import edu.ucdenver.ccp.datasource.fileparsers.ebi.interpro.InterProProtein2IprDatFileParser; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; + +import org.junit.Test; + +import java.io.File; +import java.io.IOException; + +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThat; +import static org.hamcrest.core.IsInstanceOf.instanceOf; + +public class InterProProtein2IprDatFileParserTest { + + @Test + public void parseLineWithUnknownExternalId() + throws Exception + { + InterProProtein2IprDatFileData dat = + new TestParser().parseRecordFromLine( + makeLine( + "A0A004\tIPR001962\tAsparagine synthase\tcd01991\t241\t565")); + // Doesn't throw an error for unknown IDs. + assertNotNull(dat); + // External ID handled correctly. 
+ assertThat(dat.getExternalReference(), + instanceOf(UnknownDataSourceIdentifier.class)); + } + + /* --------------------------------------------------------------------- */ + + static class TestParser extends InterProProtein2IprDatFileParser + { + TestParser() + throws IOException + { + super(makeTempFile(), CharacterEncoding.US_ASCII); + } + + private static File makeTempFile() + throws IOException + { + File f = File.createTempFile("protein2ipr", null); + f.deleteOnExit(); + return f; + } + + public InterProProtein2IprDatFileData parseRecordFromLine(Line l) { + return super.parseRecordFromLine(l); + } + } + + private Line makeLine(String textSansTerminator) { + return new Line(textSansTerminator, + Line.LineTerminator.CR, + 0, // character offset + 0, // code point offset + 1, // line number + -1); // byte offset + } +} diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java index 8c9f3f9..e65f8d9 100644 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java @@ -109,16 +109,24 @@ private void validateRecord(ProMappingRecord record, ProteinOntologyId expectedP } private void validateRecord1(ProMappingRecord record) { - validateRecord(record, new ProteinOntologyId("PR:000000005"), "is_a", new HgncID("HGNC:11773")); + validateRecord(record, + new ProteinOntologyId("PR:000000005"), + "is_a", + new HgncID("HGNC:11773")); } private void validateRecord2(ProMappingRecord record) { - validateRecord(record, new ProteinOntologyId("PR:000000005"), "is_a", new UnknownDataSourceIdentifier( - "UniProtKB_VAR:VAR_022359", null)); + validateRecord(record, + new ProteinOntologyId("PR:000000005"), + "is_a", + new 
UnknownDataSourceIdentifier("UniProtKB_VAR:VAR_022359")); } private void validateRecord3(ProMappingRecord record) { - validateRecord(record, new ProteinOntologyId("PR:000000006"), "exact", new UniProtID("P37173")); + validateRecord(record, + new ProteinOntologyId("PR:000000006"), + "exact", + new UniProtID("P37173")); } } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java index 6b0fddd..cc4d6c0 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java @@ -452,7 +452,7 @@ else if (geneIDStr.startsWith("NCBITaxon:")) return new NcbiTaxonomyID(StringUtil.removePrefix(geneIDStr, "NCBITaxon:")); logger.warn(String.format("Unhandled gene ID format: %s. Creating UnknownDataSourceIdentifier.", geneIDStr)); - return new UnknownDataSourceIdentifier(geneIDStr, null); + return new UnknownDataSourceIdentifier(geneIDStr); } catch (IllegalArgumentException e) { logger.warn("Invalid ID detected... " + e.getMessage()); return new ProbableErrorDataSourceIdentifier(geneIDStr, null, e.getMessage()); @@ -481,7 +481,7 @@ private static DataSourceIdentifier resolveInteractionID(String interactionID logger.warn(String.format("Unknown interaction ID format: %s. 
Cannot create DataElementIdentifier.", interactionIDStr)); - return new UnknownDataSourceIdentifier(interactionIDStr, null); + return new UnknownDataSourceIdentifier(interactionIDStr); } /** diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java index 14a91db..11c26da 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java @@ -34,6 +34,38 @@ * #L% */ +/* + * Colorado Computational Pharmacology's datasource project + * + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
	/**
	 * Constructor that may be used when a canonical identifier for the source
	 * in which the resource identifier is defined is not available. Delegates
	 * to the two-argument constructor, passing {@code null} as the data source
	 * string.
	 *
	 * @param resourceID
	 *            The identifier for the resource in the external, unknown source.
	 */
	public UnknownDataSourceIdentifier(String resourceID) {
		this(resourceID, null);
	}