From 5039cd78c36a0a3c103b8ba409319a0754d733cd Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 25 Jan 2016 15:57:44 -0700 Subject: [PATCH 01/36] updating poms for 0.7-SNAPSHOT development --- datasource-fileparsers/pom.xml | 5 ++--- datasource-identifiers/pom.xml | 5 ++--- datasource-rdfizer/pom.xml | 5 ++--- pom.xml | 7 +++---- 4 files changed, 9 insertions(+), 13 deletions(-) diff --git a/datasource-fileparsers/pom.xml b/datasource-fileparsers/pom.xml index 67f1dc4..a1eee00 100644 --- a/datasource-fileparsers/pom.xml +++ b/datasource-fileparsers/pom.xml @@ -1,10 +1,9 @@ - + 4.0.0 datasource edu.ucdenver.ccp - 0.6-SNAPSHOT + 0.7-SNAPSHOT datasource-fileparsers diff --git a/datasource-identifiers/pom.xml b/datasource-identifiers/pom.xml index 3a3aad5..ef3917f 100644 --- a/datasource-identifiers/pom.xml +++ b/datasource-identifiers/pom.xml @@ -1,10 +1,9 @@ - + 4.0.0 datasource edu.ucdenver.ccp - 0.6-SNAPSHOT + 0.7-SNAPSHOT datasource-identifiers diff --git a/datasource-rdfizer/pom.xml b/datasource-rdfizer/pom.xml index 9206d25..1e4b68f 100644 --- a/datasource-rdfizer/pom.xml +++ b/datasource-rdfizer/pom.xml @@ -1,10 +1,9 @@ - + 4.0.0 edu.ucdenver.ccp datasource - 0.6-SNAPSHOT + 0.7-SNAPSHOT datasource-rdfizer diff --git a/pom.xml b/pom.xml index 47e458e..3e6cc6f 100644 --- a/pom.xml +++ b/pom.xml @@ -1,9 +1,8 @@ - + 4.0.0 edu.ucdenver.ccp datasource - 0.6-SNAPSHOT + 0.7-SNAPSHOT pom @@ -210,7 +209,7 @@ - + From c4fef4e27ce9d28a9d52b4f518cdaa9f66defb1b Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 25 Jan 2016 16:20:41 -0700 Subject: [PATCH 02/36] updating develop poms to master versions to avoid merge conflicts --- datasource-fileparsers/pom.xml | 2 +- datasource-identifiers/pom.xml | 2 +- datasource-rdfizer/pom.xml | 2 +- pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datasource-fileparsers/pom.xml b/datasource-fileparsers/pom.xml index a1eee00..49ec869 100644 --- a/datasource-fileparsers/pom.xml +++ 
b/datasource-fileparsers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.7-SNAPSHOT + 0.6 datasource-fileparsers diff --git a/datasource-identifiers/pom.xml b/datasource-identifiers/pom.xml index ef3917f..a6ea21d 100644 --- a/datasource-identifiers/pom.xml +++ b/datasource-identifiers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.7-SNAPSHOT + 0.6 datasource-identifiers diff --git a/datasource-rdfizer/pom.xml b/datasource-rdfizer/pom.xml index 1e4b68f..a57828b 100644 --- a/datasource-rdfizer/pom.xml +++ b/datasource-rdfizer/pom.xml @@ -3,7 +3,7 @@ edu.ucdenver.ccp datasource - 0.7-SNAPSHOT + 0.6 datasource-rdfizer diff --git a/pom.xml b/pom.xml index 3e6cc6f..68b8d29 100644 --- a/pom.xml +++ b/pom.xml @@ -2,7 +2,7 @@ 4.0.0 edu.ucdenver.ccp datasource - 0.7-SNAPSHOT + 0.6 pom From 48974a2c8655f76fb4d67c53873b12a537e8ad13 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 25 Jan 2016 16:20:43 -0700 Subject: [PATCH 03/36] Updating develop poms back to pre merge state --- datasource-fileparsers/pom.xml | 2 +- datasource-identifiers/pom.xml | 2 +- datasource-rdfizer/pom.xml | 2 +- pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datasource-fileparsers/pom.xml b/datasource-fileparsers/pom.xml index 7c1441d..55a633d 100644 --- a/datasource-fileparsers/pom.xml +++ b/datasource-fileparsers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.6 + 0.7-SNAPSHOT datasource-fileparsers diff --git a/datasource-identifiers/pom.xml b/datasource-identifiers/pom.xml index a6ea21d..ef3917f 100644 --- a/datasource-identifiers/pom.xml +++ b/datasource-identifiers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.6 + 0.7-SNAPSHOT datasource-identifiers diff --git a/datasource-rdfizer/pom.xml b/datasource-rdfizer/pom.xml index a57828b..1e4b68f 100644 --- a/datasource-rdfizer/pom.xml +++ b/datasource-rdfizer/pom.xml @@ -3,7 +3,7 @@ edu.ucdenver.ccp datasource - 0.6 + 0.7-SNAPSHOT datasource-rdfizer diff --git a/pom.xml b/pom.xml 
index 68b8d29..3e6cc6f 100644 --- a/pom.xml +++ b/pom.xml @@ -2,7 +2,7 @@ 4.0.0 edu.ucdenver.ccp datasource - 0.6 + 0.7-SNAPSHOT pom From 2af943d40d9909f8a79eec9f9754e8ba51ddb020 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 25 Jan 2016 16:40:58 -0700 Subject: [PATCH 04/36] Removed project.version property This was previously used to set the submodule versions, however is no longer necessary due to the adoption of the jgitflow-maven-plugin --- pom.xml | 1 - 1 file changed, 1 deletion(-) diff --git a/pom.xml b/pom.xml index 3e6cc6f..dd3e590 100644 --- a/pom.xml +++ b/pom.xml @@ -9,7 +9,6 @@ UTF-8 - 0.6-SNAPSHOT From ea51a62a9edbf3cb25fc40cb7aade5d7df3608fa Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Thu, 28 Jan 2016 13:16:42 -0700 Subject: [PATCH 05/36] updated pom versions in scripts --- datasource-rdfizer/scripts/pom-rdf-gen-9606.xml | 4 ++-- datasource-rdfizer/scripts/pom-rdf-gen-ids.xml | 4 ++-- datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml | 4 ++-- datasource-rdfizer/scripts/pom-rdf-gen.xml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml b/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml index 92156f6..da7eabd 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml @@ -4,7 +4,7 @@ edu.ucdenver.ccp datasource-rdfizer-rdf-gen pom - 0.6-SNAPSHOT + 0.7-SNAPSHOT UTF-8 @@ -14,7 +14,7 @@ edu.ucdenver.ccp datasource-rdfizer - 0.6-SNAPSHOT + 0.7-SNAPSHOT jar compile diff --git a/datasource-rdfizer/scripts/pom-rdf-gen-ids.xml b/datasource-rdfizer/scripts/pom-rdf-gen-ids.xml index 06aff1f..532f61d 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen-ids.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen-ids.xml @@ -6,7 +6,7 @@ edu.ucdenver.ccp datasource-rdfizer-rdf-gen-ids pom - 0.6-SNAPSHOT + 0.7-SNAPSHOT UTF-8 @@ -16,7 +16,7 @@ edu.ucdenver.ccp datasource-rdfizer - 0.6-SNAPSHOT + 0.7-SNAPSHOT jar compile diff --git 
a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml index 009bdda..bd4c1bf 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml @@ -4,7 +4,7 @@ edu.ucdenver.ccp datasource-rdfizer-rdf-gen pom - 0.6-SNAPSHOT + 0.7-SNAPSHOT UTF-8 @@ -14,7 +14,7 @@ edu.ucdenver.ccp datasource-rdfizer - 0.6-SNAPSHOT + 0.7-SNAPSHOT jar compile diff --git a/datasource-rdfizer/scripts/pom-rdf-gen.xml b/datasource-rdfizer/scripts/pom-rdf-gen.xml index 29cacb9..0ab015d 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen.xml @@ -4,7 +4,7 @@ edu.ucdenver.ccp datasource-rdfizer-rdf-gen pom - 0.6-SNAPSHOT + 0.7-SNAPSHOT UTF-8 @@ -14,7 +14,7 @@ edu.ucdenver.ccp datasource-rdfizer - 0.6-SNAPSHOT + 0.7-SNAPSHOT jar compile From af7973f5fac4c521900644bf4975ef2b7f48b030 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 1 Feb 2016 09:42:14 -0700 Subject: [PATCH 06/36] Now uses proper BFO/RO identifiers Swapped out a few human-readable property names for their appropriate BFO/RO identifiers --- .../ccp/datasource/rdfizer/rdf/vocabulary/IAO.java | 3 +-- .../ccp/datasource/rdfizer/rdf/vocabulary/RO.java | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/IAO.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/IAO.java index 2abf3a2..60b7bb2 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/IAO.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/IAO.java @@ -45,8 +45,7 @@ */ public enum IAO { - MENTIONS("mentions"), - //DENOTES("denotes"); + MENTIONS("IAO_0000142"), DENOTES("IAO_0000219"), INFORMATION_CONTENT_ENITITY("IAO_0000030"); diff --git 
a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/RO.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/RO.java index 7b0ed6c..57ec4fa 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/RO.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/vocabulary/RO.java @@ -45,10 +45,10 @@ */ public enum RO { - LOCATED_IN("located_in"), - PART_OF("part_of"), - HAS_PART("has_part"), - HAS_PARTICIPANT("has_participant"); + LOCATED_IN("RO_0001025"), + PART_OF("BFO_0000050"), + HAS_PART("BFO_0000051"), + HAS_PARTICIPANT("RO_0000057"); private final String termName; From 82f660e845857d1b7d9800f14139f4620d3325df Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 1 Feb 2016 09:47:44 -0700 Subject: [PATCH 07/36] Test cases now reflect use of RO/BFO identifiers --- .../rdfizer/rdf/ice/RdfRecordUtilTest.java | 36 +++++------ .../rdf/ice/RdfRecordWriterImplTest.java | 36 +++++------ .../rdfizer/rdf/ice/RecordUtilTest.java | 60 +++++++++---------- .../rdfizer/rdf/ice/SubRecordUtilTest.java | 50 ++++++++-------- 4 files changed, 91 insertions(+), 91 deletions(-) diff --git a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtilTest.java b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtilTest.java index c424404..16c1061 100644 --- a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtilTest.java +++ b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtilTest.java @@ -148,8 +148,8 @@ private static class TestDataRecordWithNestedSubRecordCollection extends TestDat public void testGetRecordSchemaStatements_WithSubRecordField() { Set expectedStatements = CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, 
http://kabob.ucdenver.edu/iao/Schema)", - "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/Field)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Schema)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Field)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://purl.obolibrary.org/obo/IAO_0000030)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", @@ -157,33 +157,33 @@ public void testGetRecordSchemaStatements_WithSubRecordField() { "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Schema)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField1)", 
"(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"collection field\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"primitive int field\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://kabob.ucdenver.edu/iao/hasKeyPart, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField, 
http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField1, http://www.w3.org/2000/01/rdf-schema#comment, \"test comment for TestDataRecord.stringField\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"string field\"@en)", - "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/Schema)", - "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/Field)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Schema)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Field)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://purl.obolibrary.org/obo/IAO_0000030)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://www.w3.org/2000/01/rdf-schema#comment, \"This is a sub-record class\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, 
http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://www.w3.org/2000/01/rdf-schema#label, \"sub record\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Schema)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"sub string field\"@en)"); @@ -207,8 +207,8 @@ public void testGetRecordSchemaStatements_WithSubRecordField() { public void testGetRecordSchemaStatements_WithSubRecordCollectionField() { Set expectedStatements = CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/Schema)", - 
"(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/Field)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Schema)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Field)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://purl.obolibrary.org/obo/IAO_0000030)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", @@ -216,33 +216,33 @@ public void testGetRecordSchemaStatements_WithSubRecordCollectionField() { "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Schema)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/BFO_0000051, 
http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"collection field\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"primitive int field\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://kabob.ucdenver.edu/iao/hasKeyPart, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField1, 
http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField1, http://www.w3.org/2000/01/rdf-schema#comment, \"test comment for TestDataRecord.stringField\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"string field\"@en)", - "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/Schema)", - "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/Field)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Schema)", + "(http://kabob.ucdenver.edu/iao/Schema, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/Field)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://purl.obolibrary.org/obo/IAO_0000030)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, 
http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://www.w3.org/2000/01/rdf-schema#comment, \"This is a sub-record class\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://www.w3.org/2000/01/rdf-schema#label, \"sub record\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Schema)", - "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1)", + "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/Field)", - "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1)", + "(http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1, http://purl.org/dc/terms/hasVersion, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1, http://www.w3.org/2000/01/rdf-schema#label, \"sub string field\"@en)"); diff --git 
a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplTest.java b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplTest.java index ecc9c82..266fca9 100644 --- a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplTest.java +++ b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplTest.java @@ -180,68 +180,68 @@ private List getExpectedLines() { " .", " .", - " .", + " .", " .", " \"2010-12-17T00:00:00.000-07:00\"^^ .", - " .", + " .", " .", " .", - " .", + " .", " .", " .", " \"1\"^^ .", - " .", + " .", " .", " .", " .", - " .", + " .", " .", " .", " \"ABC-1\"@en .", - " .", + " .", " .", " .", " .", - " .", + " .", " .", " .", " .", - " .", + " .", " .", " .", " .", - " .", + " .", " .", " .", - " .", + " .", " .", " .", " \"2\"^^ .", - " .", + " .", " .", " .", " .", - " .", + " .", " .", " .", " \"DEF-2\"@en .", - " .", - " .", + " .", + " .", " .", " .", " .", - " .", + " .", " .", " .", - " .", + " .", " .", " .", " \"3\"^^ .", - " .", + " .", " .", " .", " .", - " .", + " .", " .", " .", " \"XYZ-9\"@en ."); diff --git a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RecordUtilTest.java b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RecordUtilTest.java index 49fd076..2f737c7 100644 --- a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RecordUtilTest.java +++ b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RecordUtilTest.java @@ -173,11 +173,11 @@ public int hashCode() { // "(http://kabob.ucdenver.edu/iao/kegg/keggDataField, http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/DataField)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, 
http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField1)"); + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField1)"); // // List statements = // RdfRecordUtil.getRecordSchemaDefinitionStatements(TestDataRecord.class); @@ -201,17 +201,17 @@ public int hashCode() { // "(http://kabob.ucdenver.edu/iao/kegg/keggDataField, 
http://www.w3.org/2000/01/rdf-schema#subClassOf, http://kabob.ucdenver.edu/iao/DataField)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_collectionFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_collectionFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_collectionFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_collectionFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_primitiveIntFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_primitiveIntFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_primitiveIntFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_primitiveIntFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_stringFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_stringFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/has_part, 
http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_stringFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_stringFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecordSchema1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecordSchema)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecordSchema1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecordSchema1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecord_subrecordStringFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecord_subrecordStringFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecord_subrecordStringFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecord_subrecordStringFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_subRecordDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_subRecordDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_subRecordDataField1)"); + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1, 
http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_subRecordDataField1)"); // // Collection statements = RdfRecordUtil // .getRecordSchemaStatements(TestDataRecordWithSubrecord.class, null, null, false); @@ -232,11 +232,11 @@ public int hashCode() { // .createSet( // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_collectionFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField1)", + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_primitiveIntFieldDataField1)", // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField1, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField)", - // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/has_part, 
http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField1)"); + // "(http://kabob.ucdenver.edu/iao/kegg/TestDataRecordSchema1, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/TestDataRecord_stringFieldDataField1)"); // // Collection statements = // RdfRecordUtil.getRecordFieldDeclarationStatements( @@ -265,7 +265,7 @@ public final void testGetDataSourceInstanceStatements() { "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordDataSet20101221, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordSchema1)", it.next().toString()); assertEquals( - "(http://kabob.ucdenver.edu/iao/kegg/keggDataSource20101221, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordDataSet20101221)", + "(http://kabob.ucdenver.edu/iao/kegg/keggDataSource20101221, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordDataSet20101221)", it.next().toString()); assertEquals( "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordDataSet20101221, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/DataSet)", @@ -283,22 +283,22 @@ public final void testGetRecordInstanceStatements() throws URISyntaxException { Set expectedStatements = CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/kegg/keggTestExcludeFieldDataRecordDataSet20101221, http://purl.obolibrary.org/obo/has_part, http://record.uri)", + "(http://kabob.ucdenver.edu/iao/kegg/keggTestExcludeFieldDataRecordDataSet20101221, http://purl.obolibrary.org/obo/BFO_0000051, http://record.uri)", "(http://record.uri, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord)", "(http://record.uri, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecordSchema1)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, 
http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, 
http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"^^)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)"); @@ -323,29 +323,29 @@ public final void testGetRecordInstanceStatements_WithSubRecord() throws URISynt Set expectedStatements = 
CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithSubrecordDataSet20101221, http://purl.obolibrary.org/obo/has_part, http://record.uri)", + "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithSubrecordDataSet20101221, http://purl.obolibrary.org/obo/BFO_0000051, http://record.uri)", "(http://record.uri, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord)", "(http://record.uri, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecordSchema1)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_DnIjgkTYdCorX3kHXYaWJhzcHJY)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_DnIjgkTYdCorX3kHXYaWJhzcHJY)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_DnIjgkTYdCorX3kHXYaWJhzcHJY, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_DnIjgkTYdCorX3kHXYaWJhzcHJY, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_DnIjgkTYdCorX3kHXYaWJhzcHJY, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_L5wUWixEQDDNsXhJedMS4OBYRm0)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_L5wUWixEQDDNsXhJedMS4OBYRm0)", 
"(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_L5wUWixEQDDNsXhJedMS4OBYRm0, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_L5wUWixEQDDNsXhJedMS4OBYRm0, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_collectionField_L5wUWixEQDDNsXhJedMS4OBYRm0, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_primitiveIntField_6OTcnDKpkfNEWVrrykNM-emmlhk)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_primitiveIntField_6OTcnDKpkfNEWVrrykNM-emmlhk)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_primitiveIntField_6OTcnDKpkfNEWVrrykNM-emmlhk, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_primitiveIntField_6OTcnDKpkfNEWVrrykNM-emmlhk, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_primitiveIntField_6OTcnDKpkfNEWVrrykNM-emmlhk, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"^^)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_stringField_jw2tCkX01xySTT2rYV5FCpiqRw0)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_stringField_jw2tCkX01xySTT2rYV5FCpiqRw0)", 
"(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_stringField_jw2tCkX01xySTT2rYV5FCpiqRw0, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubrecord_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_stringField_jw2tCkX01xySTT2rYV5FCpiqRw0, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubrecord_stringField_jw2tCkX01xySTT2rYV5FCpiqRw0, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/R_TestDataSubRecord_7VVwQkWKCGaD8uZQ2SMe2RFUNeU)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/R_TestDataSubRecord_7VVwQkWKCGaD8uZQ2SMe2RFUNeU)", "(http://kabob.ucdenver.edu/iao/kegg/R_TestDataSubRecord_7VVwQkWKCGaD8uZQ2SMe2RFUNeU, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecord)", "(http://kabob.ucdenver.edu/iao/kegg/R_TestDataSubRecord_7VVwQkWKCGaD8uZQ2SMe2RFUNeU, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecordSchema1)", - "(http://kabob.ucdenver.edu/iao/kegg/R_TestDataSubRecord_7VVwQkWKCGaD8uZQ2SMe2RFUNeU, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataSubRecord_subrecordStringField_7VVwQkWKCGaD8uZQ2SMe2RFUNeU)", + "(http://kabob.ucdenver.edu/iao/kegg/R_TestDataSubRecord_7VVwQkWKCGaD8uZQ2SMe2RFUNeU, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataSubRecord_subrecordStringField_7VVwQkWKCGaD8uZQ2SMe2RFUNeU)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataSubRecord_subrecordStringField_7VVwQkWKCGaD8uZQ2SMe2RFUNeU, http://kabob.ucdenver.edu/iao/hasTemplate, 
http://kabob.ucdenver.edu/iao/kegg/TestDataSubRecord_subrecordStringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataSubRecord_subrecordStringField_7VVwQkWKCGaD8uZQ2SMe2RFUNeU, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataSubRecord_subrecordStringField_7VVwQkWKCGaD8uZQ2SMe2RFUNeU, http://purl.obolibrary.org/obo/IAO_0000219, \"sub\"@en)"); @@ -370,22 +370,22 @@ public final void testGetRecordInstanceStatementsWithRecordKey() throws URISynta Set expectedStatements = CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/kegg/keggTestExcludeFieldDataRecordKeyDataSet20101221, http://purl.obolibrary.org/obo/has_part, http://record.uri)", + "(http://kabob.ucdenver.edu/iao/kegg/keggTestExcludeFieldDataRecordKeyDataSet20101221, http://purl.obolibrary.org/obo/BFO_0000051, http://record.uri)", "(http://record.uri, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord)", "(http://record.uri, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecordSchema1)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_collectionFieldDataField1)", 
"(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_O0waxDNfQT_lFeO4grbgwHJ_bxs, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_collectionField_oHpC_sn17AbL7y86SQEkK6oZqgA, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", 
"(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_primitiveIntField_K1E3g8ozXlcQiV3vtNaH7ikWY5I, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"^^)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestExcludeFieldDataRecord_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestExcludeFieldDataRecord_stringField_wM4OI6HAehQ_w0UDN6cjfEXbpXg, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)"); diff --git a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/SubRecordUtilTest.java b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/SubRecordUtilTest.java index dfd1f21..acf12d2 100644 --- a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/SubRecordUtilTest.java +++ b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/SubRecordUtilTest.java @@ -163,29 +163,29 @@ public final void testGetRecordInstanceStatementsWithSubRecord() throws URISynta Set expectedStatements = CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithSubRecordDataSet20101221, http://purl.obolibrary.org/obo/has_part, http://record.uri)", + "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithSubRecordDataSet20101221, 
http://purl.obolibrary.org/obo/BFO_0000051, http://record.uri)", "(http://record.uri, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord)", "(http://record.uri, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordSchema1)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_bxwod_CldpkRuVLuKbP0T5IQ6JQ)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_bxwod_CldpkRuVLuKbP0T5IQ6JQ)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_bxwod_CldpkRuVLuKbP0T5IQ6JQ, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_bxwod_CldpkRuVLuKbP0T5IQ6JQ, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_bxwod_CldpkRuVLuKbP0T5IQ6JQ, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_Rft4iXphN4pTKZIY-174Yxb3mcA)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_Rft4iXphN4pTKZIY-174Yxb3mcA)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_Rft4iXphN4pTKZIY-174Yxb3mcA, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_Rft4iXphN4pTKZIY-174Yxb3mcA, http://kabob.ucdenver.edu/iao/hasTemplate, 
http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_collectionField_Rft4iXphN4pTKZIY-174Yxb3mcA, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_primitiveIntField_hmNPO2pDKLqbj8jYanuGe3fDEro)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_primitiveIntField_hmNPO2pDKLqbj8jYanuGe3fDEro)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_primitiveIntField_hmNPO2pDKLqbj8jYanuGe3fDEro, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_primitiveIntField_hmNPO2pDKLqbj8jYanuGe3fDEro, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_primitiveIntField_hmNPO2pDKLqbj8jYanuGe3fDEro, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"^^)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_stringField_cK1-ZKY-VbQR72YjJQpeLtNdm34)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_stringField_cK1-ZKY-VbQR72YjJQpeLtNdm34)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_stringField_cK1-ZKY-VbQR72YjJQpeLtNdm34, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecord_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_stringField_cK1-ZKY-VbQR72YjJQpeLtNdm34, 
http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecord_stringField_cK1-ZKY-VbQR72YjJQpeLtNdm34, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws)", "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/SubRecord)", "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1)", - "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws)", + "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws, http://purl.obolibrary.org/obo/IAO_0000219, \"XYZZZZZZZZ\"@en)"); @@ -213,36 +213,36 @@ public final void testGetRecordInstanceStatementsWithCollectionSubRecord() throw Set 
expectedStatements = CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithSubRecordCollectionDataSet20101221, http://purl.obolibrary.org/obo/has_part, http://record.uri)", + "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithSubRecordCollectionDataSet20101221, http://purl.obolibrary.org/obo/BFO_0000051, http://record.uri)", "(http://record.uri, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection)", "(http://record.uri, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollectionSchema1)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_ActAF6W_MAU_W5CKGjJQDpVjRYs)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_ActAF6W_MAU_W5CKGjJQDpVjRYs)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_ActAF6W_MAU_W5CKGjJQDpVjRYs, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_ActAF6W_MAU_W5CKGjJQDpVjRYs, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_ActAF6W_MAU_W5CKGjJQDpVjRYs, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_h0X8Bargx3OrweU69XvjQp6FneE)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, 
http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_h0X8Bargx3OrweU69XvjQp6FneE)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_h0X8Bargx3OrweU69XvjQp6FneE, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_h0X8Bargx3OrweU69XvjQp6FneE, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_collectionField_h0X8Bargx3OrweU69XvjQp6FneE, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_primitiveIntField_ASovVMTKCova71bDzai_hBIv1Ek)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_primitiveIntField_ASovVMTKCova71bDzai_hBIv1Ek)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_primitiveIntField_ASovVMTKCova71bDzai_hBIv1Ek, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_primitiveIntFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_primitiveIntField_ASovVMTKCova71bDzai_hBIv1Ek, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_primitiveIntField_ASovVMTKCova71bDzai_hBIv1Ek, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"^^)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, 
http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_stringField_fuBM1_QOxFmmz2fqYeEdeqb9eoo)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_stringField_fuBM1_QOxFmmz2fqYeEdeqb9eoo)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_stringField_fuBM1_QOxFmmz2fqYeEdeqb9eoo, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithSubRecordCollection_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_stringField_fuBM1_QOxFmmz2fqYeEdeqb9eoo, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithSubRecordCollection_stringField_fuBM1_QOxFmmz2fqYeEdeqb9eoo, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws)", "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/SubRecord)", "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1)", - "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws)", + "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_h8DLlrIUz8gORKdbauuMlahhVws, http://purl.obolibrary.org/obo/BFO_0000051, 
http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_h8DLlrIUz8gORKdbauuMlahhVws, http://purl.obolibrary.org/obo/IAO_0000219, \"XYZZZZZZZZ\"@en)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_YrTvhKIp4LTfUjgPWrmt54QkIPk)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_YrTvhKIp4LTfUjgPWrmt54QkIPk)", "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_YrTvhKIp4LTfUjgPWrmt54QkIPk, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/SubRecord)", "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_YrTvhKIp4LTfUjgPWrmt54QkIPk, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/SubRecordSchema1)", - "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_YrTvhKIp4LTfUjgPWrmt54QkIPk, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_YrTvhKIp4LTfUjgPWrmt54QkIPk)", + "(http://kabob.ucdenver.edu/iao/kegg/R_SubRecord_YrTvhKIp4LTfUjgPWrmt54QkIPk, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_YrTvhKIp4LTfUjgPWrmt54QkIPk)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_YrTvhKIp4LTfUjgPWrmt54QkIPk, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/SubRecord_subStringFieldDataField1)", 
"(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_YrTvhKIp4LTfUjgPWrmt54QkIPk, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_SubRecord_subStringField_YrTvhKIp4LTfUjgPWrmt54QkIPk, http://purl.obolibrary.org/obo/IAO_0000219, \"ABABABABA\"@en)"); @@ -273,36 +273,36 @@ public final void testGetRecordInstanceStatementsWithNestedSubRecord() throws UR Set expectedStatements = CollectionsUtil .createSet( - "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithNestedSubRecordDataSet20101221, http://purl.obolibrary.org/obo/has_part, http://record.uri)", + "(http://kabob.ucdenver.edu/iao/kegg/keggTestDataRecordWithNestedSubRecordDataSet20101221, http://purl.obolibrary.org/obo/BFO_0000051, http://record.uri)", "(http://record.uri, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithNestedSubRecord)", "(http://record.uri, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithNestedSubRecordSchema1)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_SG1QsGZwhJNaTdMnzgma3v5AB24)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_SG1QsGZwhJNaTdMnzgma3v5AB24)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_SG1QsGZwhJNaTdMnzgma3v5AB24, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_SG1QsGZwhJNaTdMnzgma3v5AB24, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithNestedSubRecord_collectionFieldDataField1)", 
"(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_SG1QsGZwhJNaTdMnzgma3v5AB24, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_wICWjHJMs-mggQ_vE6Jc0mnb2As)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_wICWjHJMs-mggQ_vE6Jc0mnb2As)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_wICWjHJMs-mggQ_vE6Jc0mnb2As, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"@en)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_wICWjHJMs-mggQ_vE6Jc0mnb2As, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithNestedSubRecord_collectionFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_collectionField_wICWjHJMs-mggQ_vE6Jc0mnb2As, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/R_NestedSubRecord_QrM2DZEWgi_Lp-In-_vZnXBaPWU)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/R_NestedSubRecord_QrM2DZEWgi_Lp-In-_vZnXBaPWU)", "(http://kabob.ucdenver.edu/iao/kegg/R_NestedSubRecord_QrM2DZEWgi_Lp-In-_vZnXBaPWU, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/kegg/NestedSubRecord)", "(http://kabob.ucdenver.edu/iao/kegg/R_NestedSubRecord_QrM2DZEWgi_Lp-In-_vZnXBaPWU, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/NestedSubRecordSchema1)", - "(http://kabob.ucdenver.edu/iao/kegg/R_NestedSubRecord_QrM2DZEWgi_Lp-In-_vZnXBaPWU, 
http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0)", + "(http://kabob.ucdenver.edu/iao/kegg/R_NestedSubRecord_QrM2DZEWgi_Lp-In-_vZnXBaPWU, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0)", "(http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/irefweb/IRefWebInteractionSourceDatabase)", "(http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/irefweb/IRefWebInteractionSourceDatabaseSchema1)", - "(http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseId_hxLOHMRgT97VZ1vytl4H1dILsuc)", + "(http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseId_hxLOHMRgT97VZ1vytl4H1dILsuc)", "(http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseId_hxLOHMRgT97VZ1vytl4H1dILsuc, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/irefweb/IRefWebInteractionSourceDatabase_sourceDatabaseIdDataField1)", "(http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseId_hxLOHMRgT97VZ1vytl4H1dILsuc, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseId_hxLOHMRgT97VZ1vytl4H1dILsuc, 
http://purl.obolibrary.org/obo/IAO_0000219, http://kabob.ucdenver.edu/iao/mi_ontology/MI_ONTOLOGY_MI_0123_ICE)", - "(http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseName_8f_AI3EeN_eQ7M4Y6Ds14YrZmcU)", + "(http://kabob.ucdenver.edu/iao/irefweb/R_IRefWebInteractionSourceDatabase_IPNOgWa085q7R1Ww21fz-xD4MV0, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseName_8f_AI3EeN_eQ7M4Y6Ds14YrZmcU)", "(http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseName_8f_AI3EeN_eQ7M4Y6Ds14YrZmcU, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/irefweb/IRefWebInteractionSourceDatabase_sourceDatabaseNameDataField1)", "(http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseName_8f_AI3EeN_eQ7M4Y6Ds14YrZmcU, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/irefweb/F_IRefWebInteractionSourceDatabase_sourceDatabaseName_8f_AI3EeN_eQ7M4Y6Ds14YrZmcU, http://purl.obolibrary.org/obo/IAO_0000219, \"miTerm123\"@en)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_primitiveIntField_hzUY3pYkyLsIgSjagFn909vXijg)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_primitiveIntField_hzUY3pYkyLsIgSjagFn909vXijg)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_primitiveIntField_hzUY3pYkyLsIgSjagFn909vXijg, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithNestedSubRecord_primitiveIntFieldDataField1)", 
"(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_primitiveIntField_hzUY3pYkyLsIgSjagFn909vXijg, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_primitiveIntField_hzUY3pYkyLsIgSjagFn909vXijg, http://purl.obolibrary.org/obo/IAO_0000219, \"2\"^^)", - "(http://record.uri, http://purl.obolibrary.org/obo/has_part, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_stringField_QEauZ8dFR2e9ZMcNoVMBLPRhFb8)", + "(http://record.uri, http://purl.obolibrary.org/obo/BFO_0000051, http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_stringField_QEauZ8dFR2e9ZMcNoVMBLPRhFb8)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_stringField_QEauZ8dFR2e9ZMcNoVMBLPRhFb8, http://kabob.ucdenver.edu/iao/hasTemplate, http://kabob.ucdenver.edu/iao/kegg/TestDataRecordWithNestedSubRecord_stringFieldDataField1)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_stringField_QEauZ8dFR2e9ZMcNoVMBLPRhFb8, http://www.w3.org/1999/02/22-rdf-syntax-ns#type, http://kabob.ucdenver.edu/iao/FieldValue)", "(http://kabob.ucdenver.edu/iao/kegg/F_TestDataRecordWithNestedSubRecord_stringField_QEauZ8dFR2e9ZMcNoVMBLPRhFb8, http://purl.obolibrary.org/obo/IAO_0000219, \"1\"@en)"); From 6732dcf968052909132c03bc83e06b6826b21b06 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Fri, 5 Feb 2016 16:07:36 -0700 Subject: [PATCH 08/36] Commented out DIP as it is part of IRefWeb --- .../rdfizer/rdf/ice/FileDataSource.java | 53 ++++++++++--------- 1 file changed, 29 insertions(+), 24 deletions(-) diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java index b7356aa..33b43ec 100644 --- 
a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java @@ -102,30 +102,35 @@ */ public enum FileDataSource { - /** - * The DIP data file must be obtained manually. It is assumed to already be - * in place when RDF generation commences. It must be the only file in the - * DIP data source directory. - * - */ - DIP(DataSource.DIP) { - - @Override - protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, - File idListDir, Set taxonIds) throws IOException { - logger.info("sourceFileDirectory (exists): (" + sourceFileDirectory.exists() + ")" + sourceFileDirectory); - logger.info("file listing: " + Arrays.toString(sourceFileDirectory.listFiles())); - File dipDataFile = sourceFileDirectory.listFiles()[0]; - logger.info("File exists: " + dipDataFile.exists() + " -- " + dipDataFile.getAbsolutePath()); - FileUtil.validateFile(dipDataFile); - return new DipYYYYMMDDFileParser(dipDataFile, CharacterEncoding.US_ASCII, taxonIds); - } - - @Override - protected boolean isTaxonAware() { - return true; - } - }, + + /* + * DIP is now part of IRefWeb, so it has been commented out since it requires the extra manual step + * of logging in to the DIP website and downloading the file (and IRefWeb does not). + */ +// /** +// * The DIP data file must be obtained manually. It is assumed to already be +// * in place when RDF generation commences. It must be the only file in the +// * DIP data source directory. 
+// * +// */ +// DIP(DataSource.DIP) { +// +// @Override +// protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, +// File idListDir, Set taxonIds) throws IOException { +// logger.info("sourceFileDirectory (exists): (" + sourceFileDirectory.exists() + ")" + sourceFileDirectory); +// logger.info("file listing: " + Arrays.toString(sourceFileDirectory.listFiles())); +// File dipDataFile = sourceFileDirectory.listFiles()[0]; +// logger.info("File exists: " + dipDataFile.exists() + " -- " + dipDataFile.getAbsolutePath()); +// FileUtil.validateFile(dipDataFile); +// return new DipYYYYMMDDFileParser(dipDataFile, CharacterEncoding.US_ASCII, taxonIds); +// } +// +// @Override +// protected boolean isTaxonAware() { +// return true; +// } +// }, /** * The HPRD HPRD_ID_MAPPINGS.txt file must be obtained manually. It is * assumed to already be in place when RDF generation commences. From d115440aa5521fe4f3c5960be6ae33f704b97318 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Fri, 5 Feb 2016 16:11:48 -0700 Subject: [PATCH 09/36] Moved manually obtained datasources to the end This will simplify the RDF generation procedure --- .../rdfizer/rdf/ice/FileDataSource.java | 204 +++++++++--------- 1 file changed, 105 insertions(+), 99 deletions(-) diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java index 33b43ec..e50597d 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java @@ -131,78 +131,7 @@ public enum FileDataSource { // return true; // } // }, - /** - * The HPRD HPRD_ID_MAPPINGS.txt file must be obtained manually. It is - * assumed to already be in place when RDF generation commences. 
- */ - HPRD_ID_MAPPINGS(DataSource.HPRD) { - - @Override - protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, - File idListDir, Set taxonIds) throws IOException { - File hprdIdMappingFile = new File(sourceFileDirectory, - HprdIdMappingsTxtFileParser.HPRD_ID_MAPPINGS_TXT_FILE_NAME); - FileUtil.validateFile(hprdIdMappingFile); - return new HprdIdMappingsTxtFileParser(hprdIdMappingFile, CharacterEncoding.US_ASCII); - } - @Override - protected boolean isTaxonAware() { - return false; - } - }, - /** - * The TRANSFAC gene.dat and matrix.dat files must be obtained manually. - * They are assumed to already be in place when RDF generation commences. - */ - TRANSFAC_GENE(DataSource.TRANSFAC) { - @Override - protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, - File idListDir, Set taxonIds) throws IOException { - File transfacGeneDatFile = new File(sourceFileDirectory, TransfacGeneDatFileParser.GENE_DAT_FILE_NAME); - FileUtil.validateFile(transfacGeneDatFile); - return new TransfacGeneDatFileParser(transfacGeneDatFile, CharacterEncoding.ISO_8859_1); - } - - @Override - protected boolean isTaxonAware() { - return false; - } - }, - - TRANSFAC_MATRIX(DataSource.TRANSFAC) { - @Override - protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, - File idListDir, Set taxonIds) throws IOException { - File transfacMatrixDatFile = new File(sourceFileDirectory, TransfacMatrixDatFileParser.MATRIX_DAT_FILE_NAME); - FileUtil.validateFile(transfacMatrixDatFile); - return new TransfacMatrixDatFileParser(transfacMatrixDatFile, CharacterEncoding.ISO_8859_1); - } - - @Override - protected boolean isTaxonAware() { - return false; - } - }, - /** - * The GAD all.txt data file must be obtained manually. It is assumed to - * already be in place when RDF generation commences. 
- */ - GAD(DataSource.GAD) { - @Override - protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, - File idListDir, Set taxonIds) throws IOException { - File gadAllTxtFile = new File(sourceFileDirectory, - GeneticAssociationDbAllTxtFileParser.GAD_ALL_TXT_FILE_NAME); - FileUtil.validateFile(gadAllTxtFile); - return new GeneticAssociationDbAllTxtFileParser(gadAllTxtFile, CharacterEncoding.US_ASCII); - } - - @Override - protected boolean isTaxonAware() { - return false; - } - }, /** * */ @@ -232,19 +161,7 @@ protected boolean isTaxonAware() { } }, - PHARMGKB_RELATION(DataSource.PHARMGKB) { - @Override - protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, - File idListDir, Set taxonIds) throws IOException { - File pharmgkbRelationshipsDataFile = new File(sourceFileDirectory, "relationships.tsv"); - return new PharmGkbRelationFileParser(pharmgkbRelationshipsDataFile, CharacterEncoding.UTF_8); - } - @Override - protected boolean isTaxonAware() { - return false; - } - }, PHARMGKB_DRUG(DataSource.PHARMGKB) { @Override @@ -409,21 +326,7 @@ protected boolean isTaxonAware() { return false; } }, - /** - * - */ - OMIM(DataSource.OMIM) { - @Override - protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, - File idListDir, Set taxonIds) throws IOException { - return new OmimTxtFileParser(sourceFileDirectory, cleanSourceFiles); - } - - @Override - protected boolean isTaxonAware() { - return false; - } - }, + /** * */ @@ -693,6 +596,7 @@ protected boolean isTaxonAware() { return true; } }, + /** * @@ -732,7 +636,109 @@ protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boo protected boolean isTaxonAware() { return true; } - }; + }, + + /** + * The HPRD HPRD_ID_MAPPINGS.txt file must be obtained manually. It is + * assumed to already be in place when RDF generation commences. 
+ */ + HPRD_ID_MAPPINGS(DataSource.HPRD) { + + @Override + protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, + File idListDir, Set taxonIds) throws IOException { + File hprdIdMappingFile = new File(sourceFileDirectory, + HprdIdMappingsTxtFileParser.HPRD_ID_MAPPINGS_TXT_FILE_NAME); + FileUtil.validateFile(hprdIdMappingFile); + return new HprdIdMappingsTxtFileParser(hprdIdMappingFile, CharacterEncoding.US_ASCII); + } + + @Override + protected boolean isTaxonAware() { + return false; + } + }, + /** + * The TRANSFAC gene.dat and matrix.dat files must be obtained manually. + * They are assumed to already be in place when RDF generation commences. + */ + TRANSFAC_GENE(DataSource.TRANSFAC) { + @Override + protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, + File idListDir, Set taxonIds) throws IOException { + File transfacGeneDatFile = new File(sourceFileDirectory, TransfacGeneDatFileParser.GENE_DAT_FILE_NAME); + FileUtil.validateFile(transfacGeneDatFile); + return new TransfacGeneDatFileParser(transfacGeneDatFile, CharacterEncoding.ISO_8859_1); + } + + @Override + protected boolean isTaxonAware() { + return false; + } + }, + + TRANSFAC_MATRIX(DataSource.TRANSFAC) { + @Override + protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, + File idListDir, Set taxonIds) throws IOException { + File transfacMatrixDatFile = new File(sourceFileDirectory, TransfacMatrixDatFileParser.MATRIX_DAT_FILE_NAME); + FileUtil.validateFile(transfacMatrixDatFile); + return new TransfacMatrixDatFileParser(transfacMatrixDatFile, CharacterEncoding.ISO_8859_1); + } + + @Override + protected boolean isTaxonAware() { + return false; + } + }, + /** + * The GAD all.txt data file must be obtained manually. It is assumed to + * already be in place when RDF generation commences. 
+ */ + GAD(DataSource.GAD) { + @Override + protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, + File idListDir, Set taxonIds) throws IOException { + File gadAllTxtFile = new File(sourceFileDirectory, + GeneticAssociationDbAllTxtFileParser.GAD_ALL_TXT_FILE_NAME); + FileUtil.validateFile(gadAllTxtFile); + return new GeneticAssociationDbAllTxtFileParser(gadAllTxtFile, CharacterEncoding.US_ASCII); + } + + @Override + protected boolean isTaxonAware() { + return false; + } + }, + /** + * + */ + OMIM(DataSource.OMIM) { + @Override + protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, + File idListDir, Set taxonIds) throws IOException { + return new OmimTxtFileParser(sourceFileDirectory, cleanSourceFiles); + } + + @Override + protected boolean isTaxonAware() { + return false; + } + }, + PHARMGKB_RELATION(DataSource.PHARMGKB) { + @Override + protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, + File idListDir, Set taxonIds) throws IOException { + File pharmgkbRelationshipsDataFile = new File(sourceFileDirectory, "relationships.tsv"); + return new PharmGkbRelationFileParser(pharmgkbRelationshipsDataFile, CharacterEncoding.UTF_8); + } + + @Override + protected boolean isTaxonAware() { + return false; + } + } + ; public enum Split { BY_STAGES, NONE; From 1087aa07fa8bf84795adcabfef608a32e618aba5 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Fri, 5 Feb 2016 17:26:13 -0700 Subject: [PATCH 10/36] Update to RGD FTP server URL --- .../datasource/fileparsers/rgd/RgdGeneFileRecordReaderBase.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdGeneFileRecordReaderBase.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdGeneFileRecordReaderBase.java index 02a4bb9..472fd95 100755 --- 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdGeneFileRecordReaderBase.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdGeneFileRecordReaderBase.java @@ -55,7 +55,7 @@ */ public class RgdGeneFileRecordReaderBase extends SingleLineFileRecordReader { - public static final String FTP_SERVER = "rgd.mcw.edu"; + public static final String FTP_SERVER = "ftp.rgd.mcw.edu"; public static final String FTP_PATH = "pub/data_release"; public static final CharacterEncoding ENCODING = CharacterEncoding.UTF_8; From 0869261802f922113d148e92cd12f8cc321754a1 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Fri, 5 Feb 2016 17:31:30 -0700 Subject: [PATCH 11/36] Update RGD FTP server URL --- .../fileparsers/rgd/RgdRatGeneMpAnnotationFileRecordReader.java | 2 +- .../rgd/RgdRatGeneNboAnnotationFileRecordReader.java | 2 +- .../fileparsers/rgd/RgdRatGenePwAnnotationFileRecordReader.java | 2 +- .../rgd/RgdRatGeneRdoAnnotationFileRecordReader.java | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneMpAnnotationFileRecordReader.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneMpAnnotationFileRecordReader.java index a22575a..38b8bc9 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneMpAnnotationFileRecordReader.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneMpAnnotationFileRecordReader.java @@ -51,7 +51,7 @@ */ public class RgdRatGeneMpAnnotationFileRecordReader extends Gaf2FileRecordReader { - @FtpDownload(server = "rgd.mcw.edu", path = "pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_mp", filetype = FileType.ASCII) + @FtpDownload(server = "ftp.rgd.mcw.edu", path = 
"pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_mp", filetype = FileType.ASCII) private File annotationFile; /** diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneNboAnnotationFileRecordReader.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneNboAnnotationFileRecordReader.java index 430c59e..e8b7215 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneNboAnnotationFileRecordReader.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneNboAnnotationFileRecordReader.java @@ -49,7 +49,7 @@ */ public class RgdRatGeneNboAnnotationFileRecordReader extends Gaf2FileRecordReader { - @FtpDownload(server = "rgd.mcw.edu", path = "pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_nbo", filetype = FileType.ASCII) + @FtpDownload(server = "ftp.rgd.mcw.edu", path = "pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_nbo", filetype = FileType.ASCII) private File annotationFile; /** diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGenePwAnnotationFileRecordReader.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGenePwAnnotationFileRecordReader.java index 13a84fd..13ce188 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGenePwAnnotationFileRecordReader.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGenePwAnnotationFileRecordReader.java @@ -49,7 +49,7 @@ */ public class RgdRatGenePwAnnotationFileRecordReader extends Gaf2FileRecordReader { - @FtpDownload(server = "rgd.mcw.edu", path = "pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_pw", filetype = FileType.ASCII) + 
@FtpDownload(server = "ftp.rgd.mcw.edu", path = "pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_pw", filetype = FileType.ASCII) private File annotationFile; /** diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneRdoAnnotationFileRecordReader.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneRdoAnnotationFileRecordReader.java index 0220dd7..234c636 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneRdoAnnotationFileRecordReader.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneRdoAnnotationFileRecordReader.java @@ -49,7 +49,7 @@ */ public class RgdRatGeneRdoAnnotationFileRecordReader extends Gaf2FileRecordReader { - @FtpDownload(server = "rgd.mcw.edu", path = "pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_rdo", filetype = FileType.ASCII) + @FtpDownload(server = "ftp.rgd.mcw.edu", path = "pub/data_release/annotated_rgd_objects_by_ontology/", filename = "rattus_genes_rdo", filetype = FileType.ASCII) private File annotationFile; /** From eca2ca98a3ac610d674e2f4d4e4fc5e3655887ba Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 8 Feb 2016 11:35:33 -0700 Subject: [PATCH 12/36] Parsers no longer return null for unknown ids They use the UnknownDataSourceIdentifier or ProbableErrorDataSourceIdentifier classes to return IDs that either don't have a mapping to a proper URI or appear to be errors, e.g. 
a UniProt ID that doesn't match the UniProt ID regular expression --- .../drugbank/DrugBankDrugRecord.java | 16 +- .../ebi/uniprot/UniProtFileRecord.java | 10 +- .../GeneticAssociationDbAllTxtFileData.java | 41 +- .../hgnc/HgncDownloadFileParser.java | 20 +- .../hprd/HprdIdMappingsTxtFileParser.java | 7 +- .../irefweb/IRefWebPsiMitab2_6FileParser.java | 195 ++++---- .../kegg/KeggGeneIdListFileData.java | 10 +- .../fileparsers/kegg/KeggGenesFileData.java | 3 +- .../mgi/MRKSequenceFileParser.java | 10 +- .../ncbi/gene/EntrezGeneInfoFileParser.java | 8 +- .../pharmgkb/PharmGkbGeneFileParser.java | 462 +++++++++--------- .../pharmgkb/PharmGkbRelationFileParser.java | 7 +- .../fileparsers/pro/ProMappingFileParser.java | 39 +- .../rgd/RgdAnnotationFileIdResolver.java | 22 +- .../IRefWebPsiMitab2_6FileParserTest.java | 137 +++--- .../pro/ProMappingFileParserTest.java | 66 ++- .../fileparsers/pro/PRO_promapping.txt | 2 +- .../datasource/identifiers/DataSource.java | 14 +- .../identifiers/DataSourceElement.java | 2 +- .../identifiers/DataSourceIdResolver.java | 79 +-- .../identifiers/DataSourceIdentifier.java | 2 +- .../NucleotideAccessionResolver.java | 10 +- .../ProbableErrorDataSourceIdentifier.java | 65 +++ .../identifiers/ProteinAccessionResolver.java | 11 +- .../UnknownDataSourceIdentifier.java | 21 + 25 files changed, 721 insertions(+), 538 deletions(-) create mode 100644 datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java create mode 100644 datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java index bdf8e53..e98a736 100644 --- 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java @@ -147,7 +147,9 @@ import edu.ucdenver.ccp.datasource.identifiers.DataSource; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugBankID; import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugsProductDatabaseID; import edu.ucdenver.ccp.datasource.identifiers.ebi.interpro.PfamID; @@ -1304,14 +1306,14 @@ private static DataSourceIdentifier resolveIdentifier(String resource, String } catch (IllegalArgumentException e) { if (identifier.matches("\\d+")) { return new GiNumberID(identifier); + } else { + return new ProbableErrorDataSourceIdentifier("identifier", "GenBank", + "Observed invalid GenBank protein identifier: " + identifier); } - logger.warn("Observed invalid GenBank protein identifier: " + identifier); - return null; } } else if (resource.equals("GenBank")) { - try { - return NucleotideAccessionResolver.resolveNucleotideAccession(identifier); - } catch (IllegalArgumentException e) { + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier); + if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId.getClass())) { return ProteinAccessionResolver.resolveProteinAccession(identifier); } } else if (resource.equals("UniProtKB")) { @@ -1354,7 +1356,7 @@ private static DataSourceIdentifier resolveIdentifier(String resource, String id = new UniProtID(identifier); } catch (IllegalArgumentException e) { 
logger.warn("Unhandled identifier type: " + resource + " (identifier=" + identifier + ")"); - return null; + return new UnknownDataSourceIdentifier(identifier, resource); } if (id != null) { return id; @@ -1362,7 +1364,7 @@ private static DataSourceIdentifier resolveIdentifier(String resource, String } System.out.println("Unhandled identifier type: " + resource + " (identifier=" + identifier + ")"); - return null; + return new UnknownDataSourceIdentifier(identifier, resource); // throw new IllegalArgumentException("Unhandled identifier type: " + // resource + // " (identifier=" + identifier diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/UniProtFileRecord.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/UniProtFileRecord.java index 87491f9..eeaccde 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/UniProtFileRecord.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/UniProtFileRecord.java @@ -80,6 +80,8 @@ import edu.ucdenver.ccp.datasource.fileparsers.RecordField; import edu.ucdenver.ccp.datasource.identifiers.DataSource; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID; import edu.ucdenver.ccp.datasource.identifiers.drugbank.DrugBankID; import edu.ucdenver.ccp.datasource.identifiers.ebi.embl.EmblID; @@ -899,14 +901,10 @@ private DataSourceIdentifier resolveDatabaseIdentifer(String type, String idS return new PirnrId(idStr); } } catch (IllegalArgumentException e) { - logger.warn("Invalid identifier detected: " + e.getMessage()); - return null; + return new ProbableErrorDataSourceIdentifier(idStr, type, e.getMessage()); } - 
// throw new IllegalArgumentException("Unhandled identifier type: " - // + type + " :: " + idStr); - logger.warn("Unhandled identifier type: " + type + " :: " + idStr); - return null; + return new UnknownDataSourceIdentifier(idStr, type); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java index 1fb4bc8..33ff25a 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java @@ -73,22 +73,24 @@ * * @author Bill Baumgartner * - * ID _________ Association(Y/N) _________ Broad Phenotype Disease Class _________ Disease - * Class Code _________ MeSH Disease Terms _________ Chromosom _________ Chr-Band _________ - * _________ Gene _________ DNA Start _________ DNA End P Value Reference _________ Pubmed - * ID _________ Allele Author Description _________ Allele Functional Effects _________ - * Polymophism Class _________ Gene Name _________ RefSeq _________ Population _________ - * MeSH Geolocation _________ Submitter _________ Locus Number _________ Unigene _________ - * Narrow Phenotype _________ Mole. Phenotype Journal Title _________ rs Number _________ - * OMIM ID Year _________ Conclusion _________ Study Info _________ Env. Factor _________ GI - * Gene A _________ GI Allele of Gene A _________ GI Gene B _________ GI Allele of Gene B - * _________ GI Gene C _________ GI Allele of Gene C _________ GI Association? 
GI combine + * ID _________ Association(Y/N) _________ Broad Phenotype Disease Class + * _________ Disease Class Code _________ MeSH Disease Terms _________ + * Chromosom _________ Chr-Band _________ _________ Gene _________ DNA + * Start _________ DNA End P Value Reference _________ Pubmed ID + * _________ Allele Author Description _________ Allele Functional + * Effects _________ Polymophism Class _________ Gene Name _________ + * RefSeq _________ Population _________ MeSH Geolocation _________ + * Submitter _________ Locus Number _________ Unigene _________ Narrow + * Phenotype _________ Mole. Phenotype Journal Title _________ rs Number + * _________ OMIM ID Year _________ Conclusion _________ Study Info + * _________ Env. Factor _________ GI Gene A _________ GI Allele of Gene + * A _________ GI Gene B _________ GI Allele of Gene B _________ GI Gene + * C _________ GI Allele of Gene C _________ GI Association? GI combine * Env. Factor _________ GI relevant to Disease */ -@Record(dataSource = DataSource.GAD, schemaVersion="2", comment="Schema version is 2 b/c one field was dropped: GAD/CDC", label="GAD record") +@Record(dataSource = DataSource.GAD, schemaVersion = "2", comment = "Schema version is 2 b/c one field was dropped: GAD/CDC", label = "GAD record") public class GeneticAssociationDbAllTxtFileData extends SingleLineFileRecord { - private static final Logger logger = Logger.getLogger(GeneticAssociationDbAllTxtFileData.class); @@ -422,7 +424,7 @@ public boolean hasAssociation() { } public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxtLine(Line line) { - String[] toks = line.getText().split("\\t",-1); + String[] toks = line.getText().split("\\t", -1); if (toks.length < 23) { logger.warn("Invalid line detected (" + line.getLineNumber() + "): " + line.getText()); } @@ -454,9 +456,10 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt String geneName = toks[17]; String refseqURL = null; try { - refseqURL 
= toks[18]; + refseqURL = toks[18]; } catch (ArrayIndexOutOfBoundsException e) { - logger.error("Caught exception. Line: (" + line.getLineNumber() + ") #toks: " + toks.length+" Message: " + e.getMessage() + " LINE: " + line.getText()); + logger.error("Caught exception. Line: (" + line.getLineNumber() + ") #toks: " + toks.length + " Message: " + + e.getMessage() + " LINE: " + line.getText()); } DataSourceIdentifier nucleotideId = null; @@ -470,13 +473,7 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt if (acc.matches("\\d+")) { nucleotideId = new GiNumberID(acc); } else { - try { - nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); - } catch (IllegalArgumentException e) { - logger.info("tok: " + refseqURL + ";"); - logger.warn(e.getMessage()); - nucleotideId = null; - } + nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java index c17a690..7497744 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java @@ -59,7 +59,9 @@ import edu.ucdenver.ccp.datasource.fileparsers.hgnc.HgncDownloadFileData.SpecialistDbIdLinkPair; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; import 
edu.ucdenver.ccp.datasource.identifiers.ec.EnzymeCommissionID; import edu.ucdenver.ccp.datasource.identifiers.ensembl.EnsemblGeneID; @@ -551,9 +553,7 @@ private DataSourceIdentifier resolveSpecialistId(String idStr, String link) { return new SlcId(idStr); } - logger.warn("Unable to resolve id from: " + link); - return null; - // throw new IllegalArgumentException("Unknown link type: " + link); + return new UnknownDataSourceIdentifier(idStr, null); } /** @@ -564,14 +564,12 @@ private Set> resolveAccessionNumbers(String accListStr) Set> accNumbers = new HashSet>(); if (!accListStr.isEmpty()) { for (String acc : accListStr.split(",")) { - try { - accNumbers.add(NucleotideAccessionResolver.resolveNucleotideAccession(acc)); - } catch (IllegalArgumentException e) { - try { - accNumbers.add(ProteinAccessionResolver.resolveProteinAccession(acc)); - } catch (IllegalArgumentException e2) { - logger.warn("Cannot resolve: " + acc + " -- " + e.getMessage()); - } + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); + if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { + DataSourceIdentifier proAccId = ProteinAccessionResolver.resolveProteinAccession(acc); + accNumbers.add(proAccId); + } else { + accNumbers.add(nucAccId); } } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java index bb48ecc..a5417ea 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java @@ -61,6 +61,7 @@ import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import 
edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver; import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; import edu.ucdenver.ccp.datasource.identifiers.hprd.HprdID; @@ -114,11 +115,11 @@ protected HprdIdMappingsTxtFileData parseRecordFromLine(Line line) { } private DataSourceIdentifier resolveAccession(String acc) { - try { - return NucleotideAccessionResolver.resolveNucleotideAccession(acc); - } catch (IllegalArgumentException e) { + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); + if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { return ProteinAccessionResolver.resolveProteinAccession(acc); } + return nucAccId; } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java index 88b70b2..2c3a615 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java @@ -73,7 +73,9 @@ import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.bind.BindInteractionID; import 
edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractionID; import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID; @@ -121,8 +123,10 @@ import edu.ucdenver.ccp.identifier.publication.PubMedID; /** - * This class is used to parse DIPYYYMMDD files which can be downloaded from the DIP website - * ftp://ftp.no.embnet.org/irefindex/data/archive/release_4.0/psimi_tab/All.mitab.06042009.txt.zip + * This class is used to parse DIPYYYMMDD files which can be downloaded from the + * DIP website + * ftp://ftp.no.embnet.org/irefindex/data/archive/release_4.0/psimi_tab + * /All.mitab.06042009.txt.zip * * @author Bill Baumgartner * @see IRefWebMitab4_0FileData for file format and version specifications @@ -133,12 +137,12 @@ public class IRefWebPsiMitab2_6FileParser extends TaxonAwareSingleLineFileRecord private static final String HEADER = "#uidA\tuidB\taltA\taltB\taliasA\taliasB\tmethod\tauthor\tpmids\ttaxa\ttaxb\tinteractionType\tsourcedb\tinteractionIdentifier\tconfidence\texpansion\tbiological_role_A\tbiological_role_B\texperimental_role_A\texperimental_role_B\tinteractor_type_A\tinteractor_type_B\txrefs_A\txrefs_B\txrefs_Interaction\tAnnotations_A\tAnnotations_B\tAnnotations_Interaction\tHost_organism_taxid\tparameters_Interaction\tCreation_date\tUpdate_date\tChecksum_A\tChecksum_B\tChecksum_Interaction\tNegative\tOriginalReferenceA\tOriginalReferenceB\tFinalReferenceA\tFinalReferenceB\tMappingScoreA\tMappingScoreB\tirogida\tirogidb\tirigid\tcrogida\tcrogidb\tcrigid\ticrogida\ticrogidb\ticrigid\timex_id\tedgetype\tnumParticipants"; -// public static final String FTP_FILE_NAME = "All.mitab.03022013.txt.zip"; + // public static final String FTP_FILE_NAME = "All.mitab.03022013.txt.zip"; public static final String FTP_FILE_NAME = "All.mitab.07042015.txt.zip"; public static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; public static final String FTP_USER_NAME = "ftp"; - @FtpDownload(server = FtpHost.IREFWEB_HOST, path = 
"irefindex/data/archive/release_10.0/psi_mitab/MITAB2.6/", filename = FTP_FILE_NAME, filetype = FileType.BINARY, username = FTP_USER_NAME, decompress = true, targetFileName="All.mitab.04072015.txt") + @FtpDownload(server = FtpHost.IREFWEB_HOST, path = "irefindex/data/archive/release_10.0/psi_mitab/MITAB2.6/", filename = FTP_FILE_NAME, filetype = FileType.BINARY, username = FTP_USER_NAME, decompress = true, targetFileName = "All.mitab.04072015.txt") private File allMitabTxtFile; public IRefWebPsiMitab2_6FileParser(File file, CharacterEncoding encoding) throws IOException, @@ -192,7 +196,8 @@ protected String getExpectedFileHeader() throws IOException { } /** - * Extracts information from a line from a file and returns a IRefWebPsiMitab2_5FileData object. + * Extracts information from a line from a file and returns a + * IRefWebPsiMitab2_5FileData object. * * @param miOntologyTermResolver * @param line @@ -284,62 +289,61 @@ private IRefWebInteraction getInteraction(String detectionMethodStr, String auth private Set> resolveInteractionDbIds(String interactionIdStr) { Set> ids = new HashSet>(); for (String id : interactionIdStr.split(RegExPatterns.PIPE)) { - if (id.startsWith("edgetype:") || id.endsWith(":-")) { - // do nothing - this is a redundant storage of edge type or a null identifier - } else if (id.startsWith("BIND_Translation:")) { - ids.add(new BindTranslationId(StringUtil.removePrefix(id, "BIND_Translation:"))); - } else if (id.startsWith("irigid:")) { - ids.add(new IrigId(StringUtil.removePrefix(id, "irigid:"))); - } else if (id.startsWith("rigid:")) { - ids.add(new RigId(StringUtil.removePrefix(id, "rigid:"))); - } else if (id.startsWith("grid:")) { - ids.add(new BioGridID(StringUtil.removePrefix(id, "grid:"))); - } else if (id.startsWith("bind:")) { - ids.add(new BindInteractionID(StringUtil.removePrefix(id, "bind:"))); - } else if (id.startsWith("MPACT:")) { - ids.add(new MpactId(StringUtil.removePrefix(id, "MPACT:"))); - } else if 
(id.startsWith("mint:")) { - ids.add(new MintID(StringUtil.removePrefix(id, "mint:"))); - } else if (id.startsWith("intact:")) { - ids.add(new IntActID(StringUtil.removePrefix(id, "intact:"))); - } else if (id.startsWith("dip:")) { - ids.add(new DipInteractionID(StringUtil.removePrefix(id, "dip:"))); - } else if (id.startsWith("ophid:")) { - ids.add(new OphidId(StringUtil.removePrefix(id, "ophid:"))); - } else if (id.startsWith("InnateDB:")) { - String idbId = StringUtil.removePrefix(id, "InnateDB:"); - if (idbId.startsWith("IDB-")) { - idbId = StringUtil.removePrefix(idbId, "IDB-"); - } - ids.add(new InnateDbId(idbId)); - } else if (id.startsWith("innatedb:")) { - String idbId = StringUtil.removePrefix(id, "innatedb:"); - if (idbId.startsWith("IDB-")) { - idbId = StringUtil.removePrefix(idbId, "IDB-"); - } - ids.add(new InnateDbId(idbId)); - } else if (id.startsWith("CORUM:")) { - ids.add(new CorumId(StringUtil.removePrefix(id, "CORUM:"))); - } else if (id.startsWith("mpilit:")) { - ids.add(new MpiDbId(StringUtil.removePrefix(id, "mpilit:"))); - } else if (id.startsWith("mpiimex:")) { - ids.add(new MpiDbId(StringUtil.removePrefix(id, "mpiimex:"))); - } else if (id.startsWith("MatrixDB:")) { - ids.add(new MatrixDbId(StringUtil.removePrefix(id, "MatrixDB:"))); - } else if (id.startsWith("biogrid:")) { - ids.add(new BioGridID(StringUtil.removePrefix(id, "biogrid:"))); - } else if (id.startsWith("pubmed:")) { - ids.add(new PubMedID(StringUtil.removePrefix(id, "pubmed:"))); - } else if (id.startsWith("HPRD")) { - try { + try { + if (id.startsWith("edgetype:") || id.endsWith(":-")) { + // do nothing - this is a redundant storage of edge type or + // a null identifier + } else if (id.startsWith("BIND_Translation:")) { + ids.add(new BindTranslationId(StringUtil.removePrefix(id, "BIND_Translation:"))); + } else if (id.startsWith("irigid:")) { + ids.add(new IrigId(StringUtil.removePrefix(id, "irigid:"))); + } else if (id.startsWith("rigid:")) { + ids.add(new 
RigId(StringUtil.removePrefix(id, "rigid:"))); + } else if (id.startsWith("grid:")) { + ids.add(new BioGridID(StringUtil.removePrefix(id, "grid:"))); + } else if (id.startsWith("bind:")) { + ids.add(new BindInteractionID(StringUtil.removePrefix(id, "bind:"))); + } else if (id.startsWith("MPACT:")) { + ids.add(new MpactId(StringUtil.removePrefix(id, "MPACT:"))); + } else if (id.startsWith("mint:")) { + ids.add(new MintID(StringUtil.removePrefix(id, "mint:"))); + } else if (id.startsWith("intact:")) { + ids.add(new IntActID(StringUtil.removePrefix(id, "intact:"))); + } else if (id.startsWith("dip:")) { + ids.add(new DipInteractionID(StringUtil.removePrefix(id, "dip:"))); + } else if (id.startsWith("ophid:")) { + ids.add(new OphidId(StringUtil.removePrefix(id, "ophid:"))); + } else if (id.startsWith("InnateDB:")) { + String idbId = StringUtil.removePrefix(id, "InnateDB:"); + if (idbId.startsWith("IDB-")) { + idbId = StringUtil.removePrefix(idbId, "IDB-"); + } + ids.add(new InnateDbId(idbId)); + } else if (id.startsWith("innatedb:")) { + String idbId = StringUtil.removePrefix(id, "innatedb:"); + if (idbId.startsWith("IDB-")) { + idbId = StringUtil.removePrefix(idbId, "IDB-"); + } + ids.add(new InnateDbId(idbId)); + } else if (id.startsWith("CORUM:")) { + ids.add(new CorumId(StringUtil.removePrefix(id, "CORUM:"))); + } else if (id.startsWith("mpilit:")) { + ids.add(new MpiDbId(StringUtil.removePrefix(id, "mpilit:"))); + } else if (id.startsWith("mpiimex:")) { + ids.add(new MpiDbId(StringUtil.removePrefix(id, "mpiimex:"))); + } else if (id.startsWith("MatrixDB:")) { + ids.add(new MatrixDbId(StringUtil.removePrefix(id, "MatrixDB:"))); + } else if (id.startsWith("biogrid:")) { + ids.add(new BioGridID(StringUtil.removePrefix(id, "biogrid:"))); + } else if (id.startsWith("pubmed:")) { + ids.add(new PubMedID(StringUtil.removePrefix(id, "pubmed:"))); + } else if (id.startsWith("HPRD")) { ids.add(new HprdID(StringUtil.removePrefix(id, "HPRD:"))); - } catch 
(IllegalArgumentException e) { - logger.warn(e.getMessage()); + } else { + ids.add(new UnknownDataSourceIdentifier(id, null)); } - } else { - // throw new IllegalArgumentException("Unknown id prefix: " + id); - logger.warn("Unknown id prefix: " + id); - // return null; + } catch (IllegalArgumentException e) { + ids.add(new ProbableErrorDataSourceIdentifier(id, null, e.getMessage())); } } return ids; @@ -362,10 +366,10 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { return null; } if (idStr.startsWith("xx:")) { - return null; + return new UnknownDataSourceIdentifier(idStr, null); } if (idStr.startsWith("other:")) { - return null; + return new UnknownDataSourceIdentifier(idStr, null); } if (idStr.equals("null")) { return null; @@ -485,14 +489,10 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { return new EntrezGeneID(StringUtil.removePrefix(idStr, "entrezgene:")); } } catch (IllegalArgumentException e) { - logger.warn("Invalid identifier due to " + e.getMessage()); - logger.warn("Trying identifier as GenBank ID..."); - return getGenbankAccession(idStr); + return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); } - // throw new IllegalArgumentException("Unknown id prefix: " + idStr); - logger.warn("Unknown id prefix: " + idStr); - return null; + return new UnknownDataSourceIdentifier(idStr, null); } /** @@ -526,15 +526,11 @@ private DataSourceIdentifier getRefseqAccession(String acc) { * @return */ private DataSourceIdentifier getGenbankAccession(String acc) { - try { - return NucleotideAccessionResolver.resolveNucleotideAccession(acc); - } catch (IllegalArgumentException e) { - try { - return ProteinAccessionResolver.resolveProteinAccession(acc); - } catch (IllegalArgumentException e2) { - logger.warn("Detected invalid GenBank accession: " + acc); - return null; - } + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); + if 
(ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { + return ProteinAccessionResolver.resolveProteinAccession(acc); + } else { + return nucAccId; } } @@ -634,24 +630,24 @@ private IRefWebInteractor getInteractor(String uniqueIdStr, String altIdStr, Str private Set resolveAliasSymbols(String aliasStr) { Set aliases = new HashSet(); for (String alias : aliasStr.split(RegExPatterns.PIPE)) { - String aliasSymbol = resolveAliasSymbol(alias); - if (aliasSymbol != null) { + String aliasSymbol = alias;//resolveAliasSymbol(alias); + if (aliasSymbol != null && !aliasSymbol.equals("-")) { aliases.add(aliasSymbol); } } return aliases; } - /** - * @param alias - * @return - */ - private String resolveAliasSymbol(String aliasStr) { - if (aliasStr.startsWith("entrezgene/locuslink:")) { - return new String(StringUtil.removePrefix(aliasStr, "entrezgene/locuslink:")); - } - return null; - } +// /** +// * @param alias +// * @return +// */ +// private String resolveAliasSymbol(String aliasStr) { +// if (aliasStr.startsWith("entrezgene/locuslink:")) { +// return new String(StringUtil.removePrefix(aliasStr, "entrezgene/locuslink:")); +// } +// return aliasStr; +// } /** * @param aliasStr @@ -678,7 +674,8 @@ private DataSourceIdentifier resolveAliasId(String aliasStr) { if (aliasStr.startsWith("uniprotkb:")) { return new UniProtEntryName(StringUtil.removePrefix(aliasStr, "uniprotkb:")); } else if (aliasStr.startsWith("entrezgene/locuslink:")) { - // ignore, it is a gene symbol and is handled by resolveAliasSymbols() + // ignore, it is a gene symbol and is handled by + // resolveAliasSymbols() return null; } else if (aliasStr.startsWith("crogid:")) { return new CrogId(StringUtil.removePrefix(aliasStr, "crogid:")); @@ -691,25 +688,7 @@ private DataSourceIdentifier resolveAliasId(String aliasStr) { } else if (aliasStr.startsWith("hgnc:")) { return new HgncGeneSymbolID(StringUtil.removePrefix(aliasStr, "hgnc:")); } - throw new IllegalArgumentException("Unknown id prefix: " + 
aliasStr); - } - - public static void main(String[] args) { - BasicConfigurator.configure(); - File irefwebFile = new File("/tmp/irefweb.sample"); - try { - IRefWebPsiMitab2_6FileParser parser = new IRefWebPsiMitab2_6FileParser(irefwebFile, - CharacterEncoding.US_ASCII); - while (parser.hasNext()) { - parser.next(); - } - } catch (IllegalArgumentException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } catch (IOException e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } + return new UnknownDataSourceIdentifier(aliasStr, null); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGeneIdListFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGeneIdListFileData.java index 2cdbc52..07ce879 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGeneIdListFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGeneIdListFileData.java @@ -1,4 +1,3 @@ - package edu.ucdenver.ccp.datasource.fileparsers.kegg; /* @@ -76,13 +75,8 @@ public static KeggGeneIdListFileData parseKeggGeneIDListLine(Line line) { String keggGeneIDStr = toks[0].substring(toks[0].indexOf(":") + 1); KeggGeneID keggInternalGeneID = new KeggGeneID(keggGeneIDStr); DataSourceIdentifier externalGeneID = DataSourceIdResolver.resolveId(toks[1]); - if (externalGeneID != null) - return new KeggGeneIdListFileData(keggInternalGeneID, externalGeneID, line.getByteOffset(), - line.getLineNumber()); - - logger.error("External gene id was not resolved from " + toks[1]); - return null; - + return new KeggGeneIdListFileData(keggInternalGeneID, externalGeneID, line.getByteOffset(), + line.getLineNumber()); } logger.error("Unexpected number of tokens (" + toks.length + ") on line: " + line.toString()); diff --git 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGenesFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGenesFileData.java index daf9cde..b53d36e 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGenesFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/kegg/KeggGenesFileData.java @@ -144,8 +144,9 @@ private static Set> getDbLink(String line) { String databaseName = toks[0].replaceAll(":", ""); for (int i = 1; i < toks.length; i++) { DataSourceIdentifier id = DataSourceIdResolver.resolveId(databaseName, toks[i]); - if (id != null) + if (id != null) { ids.add(id); + } } ids.remove(null); return ids; diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java index e2d7e1d..56e2ac3 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java @@ -132,13 +132,9 @@ protected MRKSequenceFileData parseRecordFromLine(Line line) { String[] genBankIDs = toks[10].split(RegExPatterns.PIPE); for (String genBankID : genBankIDs) { if (genBankID.trim().length() > 0) { - try { - DataSourceIdentifier resolveNucleotideAccession = NucleotideAccessionResolver - .resolveNucleotideAccession(genBankID); - genBankAccessionIDs.add(resolveNucleotideAccession); - } catch (IllegalArgumentException e) { - logger.warn("Unable to resolve supposed GenBank id: " + genBankID); - } + DataSourceIdentifier resolveNucleotideAccession = NucleotideAccessionResolver + .resolveNucleotideAccession(genBankID); + genBankAccessionIDs.add(resolveNucleotideAccession); } } } diff --git 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneInfoFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneInfoFileParser.java index e69f03f..3761a99 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneInfoFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneInfoFileParser.java @@ -161,13 +161,7 @@ public static EntrezGeneInfoFileData parseGeneInfoLine(Line line) { Set> dbXrefs = new HashSet>(); if (!toks[5].equals("-")) { for (String id : toks[5].split("\\|")) { - DataSourceIdentifier resolveGeneID = null; - try { - resolveGeneID = DataSourceIdResolver.resolveId(id); - } catch (IllegalArgumentException e) { - logger.warn("Exception during ID resolution for id: " + id); - resolveGeneID = null; - } + DataSourceIdentifier resolveGeneID = DataSourceIdResolver.resolveId(id); if (resolveGeneID != null) { dbXrefs.add(resolveGeneID); } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java index 50acece..98d0afb 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java @@ -33,233 +33,235 @@ * #L% */ - -import java.io.File; -import java.io.IOException; -import java.util.ArrayList; -import java.util.Collection; -import java.util.List; - -import org.apache.commons.lang.StringUtils; -import org.apache.log4j.Logger; - -import edu.ucdenver.ccp.common.download.HttpDownload; -import edu.ucdenver.ccp.common.file.CharacterEncoding; -import edu.ucdenver.ccp.common.file.reader.Line; 
-import edu.ucdenver.ccp.common.file.reader.StreamLineReader; -import edu.ucdenver.ccp.common.string.RegExPatterns; -import edu.ucdenver.ccp.common.string.StringConstants; -import edu.ucdenver.ccp.common.string.StringUtil; -import edu.ucdenver.ccp.common.string.StringUtil.RemoveFieldEnclosures; -import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; -import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; -import edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver; -import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver; -import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; -import edu.ucdenver.ccp.datasource.identifiers.ensembl.EnsemblGeneID; -import edu.ucdenver.ccp.datasource.identifiers.hgnc.HgncID; -import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.EntrezGeneID; -import edu.ucdenver.ccp.datasource.identifiers.ncbi.omim.OmimID; -import edu.ucdenver.ccp.datasource.identifiers.ncbi.refseq.RefSeqID; -import edu.ucdenver.ccp.datasource.identifiers.obo.GeneOntologyID; -import edu.ucdenver.ccp.datasource.identifiers.other.AlfredId; -import edu.ucdenver.ccp.datasource.identifiers.other.CrossReferenceUrl; -import edu.ucdenver.ccp.datasource.identifiers.other.CtdId; -import edu.ucdenver.ccp.datasource.identifiers.other.GenAtlasId; -import edu.ucdenver.ccp.datasource.identifiers.other.GeneCardId; -import edu.ucdenver.ccp.datasource.identifiers.other.HugeId; -import edu.ucdenver.ccp.datasource.identifiers.other.HumanCycGeneId; -import edu.ucdenver.ccp.datasource.identifiers.other.IupharReceptorId; -import edu.ucdenver.ccp.datasource.identifiers.other.ModBaseId; -import edu.ucdenver.ccp.datasource.identifiers.other.MutDbId; -import edu.ucdenver.ccp.datasource.identifiers.other.UcscGenomeBrowserId; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; - -/** - * The file format for the genes.tsv file has changed. This parser should be updated. 
New header: - * PharmGKB Accession Id Entrez Id Ensembl Id Name Symbol Alternate Names Alternate Symbols Is VIP - * Has Variant Annotation Cross-references - * - * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu - * - */ -public class PharmGkbGeneFileParser extends SingleLineFileRecordReader { - - private static final Logger logger = Logger.getLogger(PharmGkbGeneFileParser.class); - - private static final String HEADER = "PharmGKB Accession Id\tEntrez Id\tEnsembl Id\tName\tSymbol\tAlternate Names\tAlternate Symbols\tIs VIP\tHas Variant Annotation\tCross-references\tHas CPIC Dosing Guideline\tChromosome\tChromosomal Start\tChromosomal Stop"; - - private static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; - - private static final String HUMANCYCGENE_PREFIX = "HumanCycGene:"; - - private static final String ALFRED_PREFIX = "alfred:"; - - private static final String CTD_PREFIX = "ctd:"; - - private static final String ENSEMBL_PREFIX = "ensembl:"; - - private static final String ENTREZGENE_PREFIX = "entrezGene:"; - - private static final String GENEATLAS_PREFIX = "genAtlas:"; - - private static final String GENECARD_PREFIX = "geneCard:"; - - private static final String GO_PREFIX = "go:"; - - private static final String HGNC_PREFIX = "hgnc:"; - - private static final String HUGE_PREFIX = "huge:"; - - private static final String IUPHAR_RECEPTOR_PREFIX = "iupharReceptor:"; - - private static final String MODBASE_PREFIX = "modBase:"; - - private static final String MUTDB_PREFIX = "mutDb:"; - - private static final String OMIM_PREFIX = "omim:"; - - private static final String REFSEQDNA_PREFIX = "refSeqDna:"; - - private static final String REFSEQPROTEIN_PREFIX = "refSeqProtein:"; - - private static final String REFSEQRNA_PREFIX = "refSeqRna:"; - - private static final String UCSCGENOMEBROWSER_PREFIX = "ucscGenomeBrowser:"; - - private static final String UNIPROT_PREFIX = "uniProtKb:"; - - private static final String 
URL_PREFIX = "url:"; - - @HttpDownload(url = "https://www.pharmgkb.org/download.do?objId=genes.zip&dlCls=common", fileName = "genes.zip", targetFileName = "genes.tsv", decompress = true) - private File pharmGkbGenesFile; - - public PharmGkbGeneFileParser(File dataFile, CharacterEncoding encoding) throws IOException { - super(dataFile, encoding, null); - } - - public PharmGkbGeneFileParser(File workDirectory, boolean clean) throws IOException { - super(workDirectory, ENCODING, null, null, null, clean); - } - - @Override - protected StreamLineReader initializeLineReaderFromDownload(CharacterEncoding encoding, String skipLinePrefix) - throws IOException { - return new StreamLineReader(pharmGkbGenesFile, encoding, skipLinePrefix); - } - - @Override - protected String getFileHeader() throws IOException { - return readLine().getText(); - } - - @Override - protected String getExpectedFileHeader() throws IOException { - return HEADER; - } - - @Override - protected PharmGkbGeneFileRecord parseRecordFromLine(Line line) { - String[] toks = line.getText().split(RegExPatterns.TAB, -1); - PharmGkbID pharmGkbAccessionId = new PharmGkbID(toks[0]); - EntrezGeneID entrezGeneId = StringUtils.isNotBlank(toks[1]) ? new EntrezGeneID(toks[1]) : null; - EnsemblGeneID ensemblGeneId = StringUtils.isNotBlank(toks[2]) ? new EnsemblGeneID(toks[2]) : null; - String name = StringUtils.isNotBlank(toks[3]) ? new String(toks[3]) : null; - String symbol = StringUtils.isNotBlank(toks[4]) ? 
new String(toks[4]) : null; - Collection alternativeNames = new ArrayList(); - if (!toks[5].isEmpty()) { - List alternativeNameStrs = StringUtil.delimitAndTrim(toks[5], StringConstants.COMMA, - StringConstants.QUOTATION_MARK, RemoveFieldEnclosures.TRUE); - for (String altNameStr : alternativeNameStrs) { - alternativeNames.add(new String(altNameStr)); - } - } - Collection alternativeSymbols = new ArrayList(); - if (!toks[6].isEmpty()) { - List alternativeSymbolStrs = StringUtil.delimitAndTrim(toks[6], StringConstants.COMMA, - StringConstants.QUOTATION_MARK, RemoveFieldEnclosures.TRUE); - for (String altSymbolStr : alternativeSymbolStrs) { - alternativeSymbols.add(new String(altSymbolStr)); - } - } - boolean isVip = Boolean.parseBoolean(toks[7]); - boolean hasVariantAnnotation = Boolean.parseBoolean(toks[8]); - Collection> crossReferences = new ArrayList>(); - if (!toks[9].isEmpty()) { - for (String refStr : toks[9].split(",")) { - DataSourceIdentifier id = null; - try { - id = resolveCrossRefId(refStr); - } catch (IllegalArgumentException e) { - logger.warn("Unable to resolve cross-reference: " + refStr + " due to: " + e.getMessage()); - } - if (id != null) { - crossReferences.add(id); - } - } - } - boolean hasCpicDosingGuideline = Boolean.parseBoolean(toks[10]); - - String chromosome = (toks[11].equalsIgnoreCase("null")) ? null : toks[11]; - Integer chromosomeStart = (toks[12].equalsIgnoreCase("null")) ? null : Integer.parseInt(toks[12]); - Integer chromosomeEnd = (toks[13].equalsIgnoreCase("null")) ? 
null : Integer.parseInt(toks[13]); - - return new PharmGkbGeneFileRecord(pharmGkbAccessionId, entrezGeneId, ensemblGeneId, name, symbol, - alternativeNames, alternativeSymbols, isVip, hasVariantAnnotation, crossReferences, - hasCpicDosingGuideline, chromosome, chromosomeStart, chromosomeEnd, line.getByteOffset(), - line.getLineNumber()); - } - - /** - * @param refStr - * @return - */ - private DataSourceIdentifier resolveCrossRefId(String refStr) { - if (refStr.startsWith(HUMANCYCGENE_PREFIX)) { - return new HumanCycGeneId(StringUtil.removePrefix(refStr, HUMANCYCGENE_PREFIX)); - } else if (refStr.startsWith(ALFRED_PREFIX)) { - return new AlfredId(StringUtil.removePrefix(refStr, ALFRED_PREFIX)); - } else if (refStr.startsWith(CTD_PREFIX)) { - return new CtdId(StringUtil.removePrefix(refStr, CTD_PREFIX)); - } else if (refStr.startsWith(ENSEMBL_PREFIX)) { - return new EnsemblGeneID(StringUtil.removePrefix(refStr, ENSEMBL_PREFIX)); - } else if (refStr.startsWith(ENTREZGENE_PREFIX)) { - return new EntrezGeneID(StringUtil.removePrefix(refStr, ENTREZGENE_PREFIX)); - } else if (refStr.startsWith(GENEATLAS_PREFIX)) { - return new GenAtlasId(StringUtil.removePrefix(refStr, GENEATLAS_PREFIX)); - } else if (refStr.startsWith(GENECARD_PREFIX)) { - return new GeneCardId(StringUtil.removePrefix(refStr, GENECARD_PREFIX)); - } else if (refStr.startsWith(GO_PREFIX)) { - return new GeneOntologyID(StringUtil.removePrefix(refStr, GO_PREFIX)); - } else if (refStr.startsWith(HGNC_PREFIX)) { - return new HgncID(StringUtil.removePrefix(refStr, HGNC_PREFIX)); - } else if (refStr.startsWith(HUGE_PREFIX)) { - return new HugeId(StringUtil.removePrefix(refStr, HUGE_PREFIX)); - } else if (refStr.startsWith(IUPHAR_RECEPTOR_PREFIX)) { - return new IupharReceptorId(StringUtil.removePrefix(refStr, IUPHAR_RECEPTOR_PREFIX)); - } else if (refStr.startsWith(MODBASE_PREFIX)) { - return new ModBaseId(StringUtil.removePrefix(refStr, MODBASE_PREFIX)); - } else if (refStr.startsWith(MUTDB_PREFIX)) { - return 
new MutDbId(StringUtil.removePrefix(refStr, MUTDB_PREFIX)); - } else if (refStr.startsWith(OMIM_PREFIX)) { - return new OmimID(StringUtil.removePrefix(refStr, OMIM_PREFIX)); - } else if (refStr.startsWith(REFSEQDNA_PREFIX)) { - return NucleotideAccessionResolver.resolveNucleotideAccession(StringUtil.removePrefix(refStr, - REFSEQDNA_PREFIX)); - } else if (refStr.startsWith(REFSEQRNA_PREFIX)) { - return new RefSeqID(StringUtil.removePrefix(refStr, REFSEQRNA_PREFIX)); - } else if (refStr.startsWith(REFSEQPROTEIN_PREFIX)) { - return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(refStr, - REFSEQPROTEIN_PREFIX)); - } else if (refStr.startsWith(UCSCGENOMEBROWSER_PREFIX)) { - return new UcscGenomeBrowserId(StringUtil.removePrefix(refStr, UCSCGENOMEBROWSER_PREFIX)); - } else if (refStr.startsWith(UNIPROT_PREFIX)) { - return new UniProtID(StringUtil.removePrefix(refStr, UNIPROT_PREFIX)); - } else if (refStr.startsWith(URL_PREFIX)) { - return new CrossReferenceUrl(StringUtil.removePrefix(refStr, URL_PREFIX)); - } else { - throw new IllegalArgumentException("Unknown cross-reference prefix: " + refStr); - } - } - -} +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.apache.commons.lang.StringUtils; +import org.apache.log4j.Logger; + +import edu.ucdenver.ccp.common.download.HttpDownload; +import edu.ucdenver.ccp.common.file.CharacterEncoding; +import edu.ucdenver.ccp.common.file.reader.Line; +import edu.ucdenver.ccp.common.file.reader.StreamLineReader; +import edu.ucdenver.ccp.common.string.RegExPatterns; +import edu.ucdenver.ccp.common.string.StringConstants; +import edu.ucdenver.ccp.common.string.StringUtil; +import edu.ucdenver.ccp.common.string.StringUtil.RemoveFieldEnclosures; +import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; +import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import 
edu.ucdenver.ccp.datasource.identifiers.NucleotideAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; +import edu.ucdenver.ccp.datasource.identifiers.ensembl.EnsemblGeneID; +import edu.ucdenver.ccp.datasource.identifiers.hgnc.HgncID; +import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.EntrezGeneID; +import edu.ucdenver.ccp.datasource.identifiers.ncbi.omim.OmimID; +import edu.ucdenver.ccp.datasource.identifiers.ncbi.refseq.RefSeqID; +import edu.ucdenver.ccp.datasource.identifiers.obo.GeneOntologyID; +import edu.ucdenver.ccp.datasource.identifiers.other.AlfredId; +import edu.ucdenver.ccp.datasource.identifiers.other.CrossReferenceUrl; +import edu.ucdenver.ccp.datasource.identifiers.other.CtdId; +import edu.ucdenver.ccp.datasource.identifiers.other.GenAtlasId; +import edu.ucdenver.ccp.datasource.identifiers.other.GeneCardId; +import edu.ucdenver.ccp.datasource.identifiers.other.HugeId; +import edu.ucdenver.ccp.datasource.identifiers.other.HumanCycGeneId; +import edu.ucdenver.ccp.datasource.identifiers.other.IupharReceptorId; +import edu.ucdenver.ccp.datasource.identifiers.other.ModBaseId; +import edu.ucdenver.ccp.datasource.identifiers.other.MutDbId; +import edu.ucdenver.ccp.datasource.identifiers.other.UcscGenomeBrowserId; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; + +/** + * The file format for the genes.tsv file has changed. This parser should be + * updated. 
New header: PharmGKB Accession Id Entrez Id Ensembl Id Name Symbol + * Alternate Names Alternate Symbols Is VIP Has Variant Annotation + * Cross-references + * + * @author Colorado Computational Pharmacology, UC Denver; + * ccpsupport@ucdenver.edu + * + */ +public class PharmGkbGeneFileParser extends SingleLineFileRecordReader { + + private static final Logger logger = Logger.getLogger(PharmGkbGeneFileParser.class); + + private static final String HEADER = "PharmGKB Accession Id\tEntrez Id\tEnsembl Id\tName\tSymbol\tAlternate Names\tAlternate Symbols\tIs VIP\tHas Variant Annotation\tCross-references\tHas CPIC Dosing Guideline\tChromosome\tChromosomal Start\tChromosomal Stop"; + + private static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; + + private static final String HUMANCYCGENE_PREFIX = "HumanCycGene:"; + + private static final String ALFRED_PREFIX = "alfred:"; + + private static final String CTD_PREFIX = "ctd:"; + + private static final String ENSEMBL_PREFIX = "ensembl:"; + + private static final String ENTREZGENE_PREFIX = "entrezGene:"; + + private static final String GENEATLAS_PREFIX = "genAtlas:"; + + private static final String GENECARD_PREFIX = "geneCard:"; + + private static final String GO_PREFIX = "go:"; + + private static final String HGNC_PREFIX = "hgnc:"; + + private static final String HUGE_PREFIX = "huge:"; + + private static final String IUPHAR_RECEPTOR_PREFIX = "iupharReceptor:"; + + private static final String MODBASE_PREFIX = "modBase:"; + + private static final String MUTDB_PREFIX = "mutDb:"; + + private static final String OMIM_PREFIX = "omim:"; + + private static final String REFSEQDNA_PREFIX = "refSeqDna:"; + + private static final String REFSEQPROTEIN_PREFIX = "refSeqProtein:"; + + private static final String REFSEQRNA_PREFIX = "refSeqRna:"; + + private static final String UCSCGENOMEBROWSER_PREFIX = "ucscGenomeBrowser:"; + + private static final String UNIPROT_PREFIX = "uniProtKb:"; + + private static final String 
URL_PREFIX = "url:"; + + @HttpDownload(url = "https://www.pharmgkb.org/download.do?objId=genes.zip&dlCls=common", fileName = "genes.zip", targetFileName = "genes.tsv", decompress = true) + private File pharmGkbGenesFile; + + public PharmGkbGeneFileParser(File dataFile, CharacterEncoding encoding) throws IOException { + super(dataFile, encoding, null); + } + + public PharmGkbGeneFileParser(File workDirectory, boolean clean) throws IOException { + super(workDirectory, ENCODING, null, null, null, clean); + } + + @Override + protected StreamLineReader initializeLineReaderFromDownload(CharacterEncoding encoding, String skipLinePrefix) + throws IOException { + return new StreamLineReader(pharmGkbGenesFile, encoding, skipLinePrefix); + } + + @Override + protected String getFileHeader() throws IOException { + return readLine().getText(); + } + + @Override + protected String getExpectedFileHeader() throws IOException { + return HEADER; + } + + @Override + protected PharmGkbGeneFileRecord parseRecordFromLine(Line line) { + String[] toks = line.getText().split(RegExPatterns.TAB, -1); + PharmGkbID pharmGkbAccessionId = new PharmGkbID(toks[0]); + EntrezGeneID entrezGeneId = StringUtils.isNotBlank(toks[1]) ? new EntrezGeneID(toks[1]) : null; + EnsemblGeneID ensemblGeneId = StringUtils.isNotBlank(toks[2]) ? new EnsemblGeneID(toks[2]) : null; + String name = StringUtils.isNotBlank(toks[3]) ? new String(toks[3]) : null; + String symbol = StringUtils.isNotBlank(toks[4]) ? 
new String(toks[4]) : null; + Collection alternativeNames = new ArrayList(); + if (!toks[5].isEmpty()) { + List alternativeNameStrs = StringUtil.delimitAndTrim(toks[5], StringConstants.COMMA, + StringConstants.QUOTATION_MARK, RemoveFieldEnclosures.TRUE); + for (String altNameStr : alternativeNameStrs) { + alternativeNames.add(new String(altNameStr)); + } + } + Collection alternativeSymbols = new ArrayList(); + if (!toks[6].isEmpty()) { + List alternativeSymbolStrs = StringUtil.delimitAndTrim(toks[6], StringConstants.COMMA, + StringConstants.QUOTATION_MARK, RemoveFieldEnclosures.TRUE); + for (String altSymbolStr : alternativeSymbolStrs) { + alternativeSymbols.add(new String(altSymbolStr)); + } + } + boolean isVip = Boolean.parseBoolean(toks[7]); + boolean hasVariantAnnotation = Boolean.parseBoolean(toks[8]); + Collection> crossReferences = new ArrayList>(); + if (!toks[9].isEmpty()) { + for (String refStr : toks[9].split(",")) { + DataSourceIdentifier id = resolveCrossRefId(refStr); + if (id != null) { + crossReferences.add(id); + } + } + } + boolean hasCpicDosingGuideline = Boolean.parseBoolean(toks[10]); + + String chromosome = (toks[11].equalsIgnoreCase("null")) ? null : toks[11]; + Integer chromosomeStart = (toks[12].equalsIgnoreCase("null")) ? null : Integer.parseInt(toks[12]); + Integer chromosomeEnd = (toks[13].equalsIgnoreCase("null")) ? 
null : Integer.parseInt(toks[13]); + + return new PharmGkbGeneFileRecord(pharmGkbAccessionId, entrezGeneId, ensemblGeneId, name, symbol, + alternativeNames, alternativeSymbols, isVip, hasVariantAnnotation, crossReferences, + hasCpicDosingGuideline, chromosome, chromosomeStart, chromosomeEnd, line.getByteOffset(), + line.getLineNumber()); + } + + /** + * @param refStr + * @return + */ + private DataSourceIdentifier resolveCrossRefId(String refStr) { + try { + if (refStr.startsWith(HUMANCYCGENE_PREFIX)) { + return new HumanCycGeneId(StringUtil.removePrefix(refStr, HUMANCYCGENE_PREFIX)); + } else if (refStr.startsWith(ALFRED_PREFIX)) { + return new AlfredId(StringUtil.removePrefix(refStr, ALFRED_PREFIX)); + } else if (refStr.startsWith(CTD_PREFIX)) { + return new CtdId(StringUtil.removePrefix(refStr, CTD_PREFIX)); + } else if (refStr.startsWith(ENSEMBL_PREFIX)) { + return new EnsemblGeneID(StringUtil.removePrefix(refStr, ENSEMBL_PREFIX)); + } else if (refStr.startsWith(ENTREZGENE_PREFIX)) { + return new EntrezGeneID(StringUtil.removePrefix(refStr, ENTREZGENE_PREFIX)); + } else if (refStr.startsWith(GENEATLAS_PREFIX)) { + return new GenAtlasId(StringUtil.removePrefix(refStr, GENEATLAS_PREFIX)); + } else if (refStr.startsWith(GENECARD_PREFIX)) { + return new GeneCardId(StringUtil.removePrefix(refStr, GENECARD_PREFIX)); + } else if (refStr.startsWith(GO_PREFIX)) { + return new GeneOntologyID(StringUtil.removePrefix(refStr, GO_PREFIX)); + } else if (refStr.startsWith(HGNC_PREFIX)) { + return new HgncID(StringUtil.removePrefix(refStr, HGNC_PREFIX)); + } else if (refStr.startsWith(HUGE_PREFIX)) { + return new HugeId(StringUtil.removePrefix(refStr, HUGE_PREFIX)); + } else if (refStr.startsWith(IUPHAR_RECEPTOR_PREFIX)) { + return new IupharReceptorId(StringUtil.removePrefix(refStr, IUPHAR_RECEPTOR_PREFIX)); + } else if (refStr.startsWith(MODBASE_PREFIX)) { + return new ModBaseId(StringUtil.removePrefix(refStr, MODBASE_PREFIX)); + } else if (refStr.startsWith(MUTDB_PREFIX)) { 
+ return new MutDbId(StringUtil.removePrefix(refStr, MUTDB_PREFIX)); + } else if (refStr.startsWith(OMIM_PREFIX)) { + return new OmimID(StringUtil.removePrefix(refStr, OMIM_PREFIX)); + } else if (refStr.startsWith(REFSEQDNA_PREFIX)) { + return NucleotideAccessionResolver.resolveNucleotideAccession(StringUtil.removePrefix(refStr, + REFSEQDNA_PREFIX)); + } else if (refStr.startsWith(REFSEQRNA_PREFIX)) { + return new RefSeqID(StringUtil.removePrefix(refStr, REFSEQRNA_PREFIX)); + } else if (refStr.startsWith(REFSEQPROTEIN_PREFIX)) { + return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(refStr, + REFSEQPROTEIN_PREFIX)); + } else if (refStr.startsWith(UCSCGENOMEBROWSER_PREFIX)) { + return new UcscGenomeBrowserId(StringUtil.removePrefix(refStr, UCSCGENOMEBROWSER_PREFIX)); + } else if (refStr.startsWith(UNIPROT_PREFIX)) { + return new UniProtID(StringUtil.removePrefix(refStr, UNIPROT_PREFIX)); + } else if (refStr.startsWith(URL_PREFIX)) { + return new CrossReferenceUrl(StringUtil.removePrefix(refStr, URL_PREFIX)); + } else { + return new UnknownDataSourceIdentifier(refStr, null); + } + } catch (IllegalArgumentException e) { + return new ProbableErrorDataSourceIdentifier(refStr, null, e.getMessage()); + } + } + +} diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java index 146c61c..7159666 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java @@ -40,9 +40,9 @@ import java.util.Collection; import java.util.HashSet; import java.util.Set; - + import org.apache.log4j.Logger; - + import edu.ucdenver.ccp.common.file.CharacterEncoding; import 
edu.ucdenver.ccp.common.file.reader.Line; import edu.ucdenver.ccp.common.string.RegExPatterns; @@ -51,6 +51,7 @@ import edu.ucdenver.ccp.common.string.StringUtil.RemoveFieldEnclosures; import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ncbi.RefSnpID; import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbHaplotypeId; import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; @@ -127,7 +128,7 @@ private Set> resolveEntityId(String idStr, String entity } else if (entityType.equals(ENTITY_TYPE_VARIANT_LOCATION)) { ids.add(new PharmGkbVariantLocationId(id)); } else { - logger.warn("Unhandled PharmGkb entity type detected: " + idStr + " type = " + entityType); + ids.add(new UnknownDataSourceIdentifier(id, null)); } } return ids; diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java index 0bd49a0..94e8bb2 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java @@ -41,10 +41,16 @@ import edu.ucdenver.ccp.common.file.reader.Line; import edu.ucdenver.ccp.common.file.reader.StreamLineReader; import edu.ucdenver.ccp.common.ftp.FTPUtil.FileType; +import edu.ucdenver.ccp.common.string.StringUtil; import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecordReader; -import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdResolver; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; +import 
edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; +import edu.ucdenver.ccp.datasource.identifiers.hgnc.HgncID; +import edu.ucdenver.ccp.datasource.identifiers.mgi.MgiGeneID; import edu.ucdenver.ccp.datasource.identifiers.obo.ProteinOntologyId; +import edu.ucdenver.ccp.datasource.identifiers.rgd.RgdID; /** * File parser for Protein Ongology promapping.txt file. @@ -95,8 +101,8 @@ protected StreamLineReader initializeLineReaderFromDownload(CharacterEncoding en * (non-Javadoc) * * @see - * edu.ucdenver.ccp.fileparsers.SingleLineFileRecordReader#parseRecordFromLine(edu.ucdenver. - * ccp.common.file.reader.LineReader.Line) + * edu.ucdenver.ccp.fileparsers.SingleLineFileRecordReader#parseRecordFromLine + * (edu.ucdenver. ccp.common.file.reader.LineReader.Line) */ @Override protected ProMappingRecord parseRecordFromLine(Line line) { @@ -106,11 +112,8 @@ protected ProMappingRecord parseRecordFromLine(Line line) { if (text.startsWith("PR:")) { String[] tokens = text.split("\t"); - ProteinOntologyId fromId = (ProteinOntologyId) DataSourceIdResolver.resolveId(tokens[0].trim()); - if (tokens[1].trim().startsWith("UniProtKB_VAR")) - return null; - - DataSourceIdentifier targetId = DataSourceIdResolver.resolveId(tokens[1].trim()); + ProteinOntologyId fromId = new ProteinOntologyId(tokens[0].trim()); + DataSourceIdentifier targetId = resolveId(tokens[1].trim()); String mappingType = tokens[2].trim(); r = new ProMappingRecord(fromId, targetId, mappingType, line.getByteOffset(), line.getLineNumber()); @@ -118,4 +121,24 @@ protected ProMappingRecord parseRecordFromLine(Line line) { return r; } + + private DataSourceIdentifier resolveId(String idStr) { + try { + if (idStr.startsWith("MGI:")) { + return new MgiGeneID(idStr); + } + if (idStr.startsWith("RGD:")) { + return new RgdID(StringUtil.removePrefix(idStr, "RGD:")); + } + if (idStr.startsWith("HGNC:")) { + return new HgncID(idStr); + } + if 
(idStr.startsWith("UniProtKB:")) { + return new UniProtID(StringUtil.removePrefix(idStr, "UniProtKB:")); + } + } catch (IllegalArgumentException e) { + return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); + } + return new UnknownDataSourceIdentifier(idStr, null); + } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java index 69b5c08..2cd990f 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java @@ -37,6 +37,7 @@ import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.IdResolver; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.EntrezGeneID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.omim.OmimID; @@ -48,7 +49,8 @@ import edu.ucdenver.ccp.identifier.publication.PubMedID; /** - * @author Center for Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu + * @author Center for Computational Pharmacology, UC Denver; + * ccpsupport@ucdenver.edu * */ public class RgdAnnotationFileIdResolver implements IdResolver { @@ -62,7 +64,8 @@ public DataSourceIdentifier resolveId(String idStr) { return null; } if (idStr.matches("[Rr][Gg][Dd][:;]\\d+")) { - // there are instances with mixed case, e.g. RGd: and with semi-colons instead of colons + // there are instances with mixed case, e.g. 
RGd: and with + // semi-colons instead of colons return new RgdID(idStr.substring(4)); } if (idStr.matches("[Rr][Gg][Dd]\\d+")) { @@ -82,7 +85,8 @@ public DataSourceIdentifier resolveId(String idStr) { return new RgdID(idStr.substring(4)); } if (idStr.matches("\\d+")) { - // there are a few typos where the "RGD:" prefix is missing, e.g. 1550157 + // there are a few typos where the "RGD:" prefix is missing, e.g. + // 1550157 return new RgdID(idStr); } if (idStr.matches("MP:\\d+")) { @@ -108,27 +112,29 @@ public DataSourceIdentifier resolveId(String idStr) { } if (idStr.matches("rno:\\d+")) { logger.warn("Ignoring RNO identifier: " + idStr + ". Not sure what this references..."); - // not sure what this is.. could be a kegg gene? it's used in the withOrFrom column + // not sure what this is.. could be a kegg gene? it's used in the + // withOrFrom column return null; } if (idStr.startsWith("UniProtKB:")) { return new UniProtID(idStr.substring(10)); } - throw new IllegalArgumentException("Unhandled ID type: " + idStr); + return new UnknownDataSourceIdentifier(idStr, null); } /* * (non-Javadoc) * - * @see edu.ucdenver.ccp.datasource.identifiers.IdResolver#resolveId(java.lang.String, - * java.lang.String) + * @see + * edu.ucdenver.ccp.datasource.identifiers.IdResolver#resolveId(java.lang + * .String, java.lang.String) */ @Override public DataSourceIdentifier resolveId(String db, String id) { if (db.equals("RGD") && id.matches("\\d+")) { return new RgdID(id); } - throw new IllegalArgumentException("Unhandled ID type -- db:" + db + " id: " + id); + return new UnknownDataSourceIdentifier(id, db); } } diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java index 79a4824..2994431 100644 --- 
a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java @@ -48,6 +48,7 @@ import edu.ucdenver.ccp.common.file.CharacterEncoding; import edu.ucdenver.ccp.datasource.fileparsers.test.RecordReaderTester; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.bind.BindInteractionID; import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtEntryName; import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; @@ -94,22 +95,28 @@ public void testParser() throws IOException { * uniprotkb:P38276|refseq:NP_009695|entrezgene/locuslink * :852434|rogid:UsO9ZYVJXLI50JBd/g0C1NtSeXI559292|irogid:16835195 * uniprotkb:P38276|refseq - * :NP_009695|entrezgene/locuslink:852434|rogid:UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 - * |irogid:16835195 - * uniprotkb:YBY7_YEAST|entrezgene/locuslink:YBR137W|crogid:UsO9ZYVJXLI50JBd + * :NP_009695|entrezgene/locuslink:852434|rogid + * :UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 |irogid:16835195 + * uniprotkb:YBY7_YEAST + * |entrezgene/locuslink:YBR137W|crogid:UsO9ZYVJXLI50JBd * /g0C1NtSeXI559292|icrogid:16835195 * uniprotkb:YBY7_YEAST|entrezgene/locuslink:YBR137W|crogid - * :UsO9ZYVJXLI50JBd/g0C1NtSeXI559292|icrogid:16835195 MI:0018(2 hybrid) - - * pubmed:10655498 taxid:559292(Saccharomyces cerevisiae S288c) - * taxid:559292(Saccharomyces cerevisiae S288c) - MI:0000(BIND_Translation) - * BIND_Translation:1261|rigid:+++94o2VtVJcuk6jD3H2JZXaVYc|irigid:617101|edgetype:X - * lpr:4518|hpr:5191|np:2 none MI:0000(unspecified) MI:0000(unspecified) - * MI:0000(unspecified) MI:0000(unspecified) MI:0326(protein) MI:0326(protein) - - - - - - * - - - 2010/05/18 2010/05/18 rogid:UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 - * 
rogid:UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 rigid:+++94o2VtVJcuk6jD3H2JZXaVYc false - * refseq:NP_009695 refseq:NP_009695 refseq:NP_009695 refseq:NP_009695 P P 16835195 - * 16835195 617101 UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 - * +++94o2VtVJcuk6jD3H2JZXaVYc 16835195 16835195 617101 - X 2 + * :UsO9ZYVJXLI50JBd/g0C1NtSeXI559292|icrogid:16835195 MI:0018(2 + * hybrid) - pubmed:10655498 taxid:559292(Saccharomyces cerevisiae + * S288c) taxid:559292(Saccharomyces cerevisiae S288c) - + * MI:0000(BIND_Translation) + * BIND_Translation:1261|rigid:+++94o2VtVJcuk6jD3H2JZXaVYc + * |irigid:617101|edgetype:X lpr:4518|hpr:5191|np:2 none + * MI:0000(unspecified) MI:0000(unspecified) MI:0000(unspecified) + * MI:0000(unspecified) MI:0326(protein) MI:0326(protein) - - - - - + * - - - 2010/05/18 2010/05/18 + * rogid:UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 + * rogid:UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 + * rigid:+++94o2VtVJcuk6jD3H2JZXaVYc false refseq:NP_009695 + * refseq:NP_009695 refseq:NP_009695 refseq:NP_009695 P P 16835195 + * 16835195 617101 UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 + * UsO9ZYVJXLI50JBd/g0C1NtSeXI559292 +++94o2VtVJcuk6jD3H2JZXaVYc + * 16835195 16835195 617101 - X 2 */ parser.next(); @@ -122,23 +129,29 @@ public void testParser() throws IOException { /* * rogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796 uniprotkb:P05132 * PDB:1YDT_I|PDB:1YDR_I|PDB:1YDS_I - * |PDB:1FMO_I|PDB:1STC_I|rogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796|irogid:9981084 - * uniprotkb: - * P05132|refseq:NP_032880|entrezgene/locuslink:18747|rogid:HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 + * |PDB:1FMO_I|PDB:1STC_I|rogid:Ivetsb7L + * /rt8ds+TyhtJZKxTtVE9796|irogid:9981084 uniprotkb: + * P05132|refseq:NP_032880 + * |entrezgene/locuslink:18747|rogid:HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 * |irogid:2201887 - * rogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796|crogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796 - * |icrogid:9981084|- - * uniprotkb:KAPCA_MOUSE|entrezgene/locuslink:Prkaca|crogid:HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 - * |icrogid:2201887 
MI:0114(three-dimensional-structure) - pubmed:1862342 - * taxid:9796(Equus caballus) taxid:10090(Mus musculus) - MI:0462(bind) - * bind:76262|rigid:++f9f/9TQhDLvdrGu56SalIhHSA|irigid:617146|edgetype:X - * lpr:1|hpr:6|np:6 none MI:0000(unspecified) MI:0000(unspecified) MI:0000(unspecified) - * MI:0000(unspecified) MI:0326(protein) MI:0326(protein) - - - - - - - - 2010/05/18 - * 2010/05/18 rogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796 - * rogid:HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 rigid:++f9f/9TQhDLvdrGu56SalIhHSA false - * GenBank:"1FMO_I" GenBank:NP_032880 PDB:1FMO_I refseq:NP_032880 PT P 9981084 2201887 - * 617146 Ivetsb7L/rt8ds+TyhtJZKxTtVE9796 HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 - * ++f9f/9TQhDLvdrGu56SalIhHSA 9981084 2201887 617146 - X 2 + * rogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796|crogid:Ivetsb7L + * /rt8ds+TyhtJZKxTtVE9796 |icrogid:9981084|- + * uniprotkb:KAPCA_MOUSE|entrezgene + * /locuslink:Prkaca|crogid:HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 + * |icrogid:2201887 MI:0114(three-dimensional-structure) - + * pubmed:1862342 taxid:9796(Equus caballus) taxid:10090(Mus + * musculus) - MI:0462(bind) + * bind:76262|rigid:++f9f/9TQhDLvdrGu56SalIhHSA + * |irigid:617146|edgetype:X lpr:1|hpr:6|np:6 none + * MI:0000(unspecified) MI:0000(unspecified) MI:0000(unspecified) + * MI:0000(unspecified) MI:0326(protein) MI:0326(protein) - - - - - + * - - - 2010/05/18 2010/05/18 rogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796 + * rogid:HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 + * rigid:++f9f/9TQhDLvdrGu56SalIhHSA false GenBank:"1FMO_I" + * GenBank:NP_032880 PDB:1FMO_I refseq:NP_032880 PT P 9981084 + * 2201887 617146 Ivetsb7L/rt8ds+TyhtJZKxTtVE9796 + * HdW51RuiujpUxo0Fu8TbWz3Yk8c10090 ++f9f/9TQhDLvdrGu56SalIhHSA + * 9981084 2201887 617146 - X 2 */ IRefWebPsiMitab2_6FileData record = parser.next(); @@ -175,9 +188,12 @@ public void testParser() throws IOException { expectedAliasesB.add(new IcrogId("2201887")); assertEquals(expectedAliasesB, record.getInteractorB().getAliasIds()); - 
assertEmpty(record.getInteractorA().getAliasSymbols()); + assertEquals(CollectionsUtil.createSet("rogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796", + "crogid:Ivetsb7L/rt8ds+TyhtJZKxTtVE9796", "icrogid:9981084"), record.getInteractorA() + .getAliasSymbols()); - Set expectedAliasBSymbols = CollectionsUtil.createSet(new String("Prkaca")); + Set expectedAliasBSymbols = CollectionsUtil.createSet("crogid:HdW51RuiujpUxo0Fu8TbWz3Yk8c10090", + "entrezgene/locuslink:Prkaca", "icrogid:2201887", "uniprotkb:KAPCA_MOUSE"); assertEquals(expectedAliasBSymbols, record.getInteractorB().getAliasSymbols()); assertEquals(new IRefWebInteractionDetectionMethod(new MolecularInteractionOntologyTermID("MI:0114"), @@ -189,12 +205,13 @@ public void testParser() throws IOException { assertEquals(new IRefWebInteractorOrganism(new NcbiTaxonomyID(9796), "Equus caballus"), record .getInteractorA().getNcbiTaxonomyId()); - assertEquals(new IRefWebInteractorOrganism(new NcbiTaxonomyID(10090), "Mus musculus"), record.getInteractorB() - .getNcbiTaxonomyId()); + assertEquals(new IRefWebInteractorOrganism(new NcbiTaxonomyID(10090), "Mus musculus"), record + .getInteractorB().getNcbiTaxonomyId()); assertNull(record.getInteraction().getInteractionType()); - assertEquals(new IRefWebInteractionSourceDatabase(new MolecularInteractionOntologyTermID("MI:0462"), "bind"), + assertEquals( + new IRefWebInteractionSourceDatabase(new MolecularInteractionOntologyTermID("MI:0462"), "bind"), record.getSourceDb()); Set> expectedInteractionDbIds = new HashSet>(); @@ -208,55 +225,57 @@ public void testParser() throws IOException { assertEquals("none", record.getInteraction().getExpansion()); - assertEquals(new IRefWebInteractorBiologicalRole(new MolecularInteractionOntologyTermID("MI:0000"), "unspecified"), - record.getInteractorA().getBiologicalRole()); - assertEquals(new IRefWebInteractorBiologicalRole(new MolecularInteractionOntologyTermID("MI:0000"), "unspecified"), - record.getInteractorB().getBiologicalRole()); - 
assertEquals(new IRefWebInteractorExperimentalRole(new MolecularInteractionOntologyTermID("MI:0000"), "unspecified"), - record.getInteractorA().getExperimentalRole()); - assertEquals(new IRefWebInteractorExperimentalRole(new MolecularInteractionOntologyTermID("MI:0000"), "unspecified"), - record.getInteractorB().getExperimentalRole()); + assertEquals(new IRefWebInteractorBiologicalRole(new MolecularInteractionOntologyTermID("MI:0000"), + "unspecified"), record.getInteractorA().getBiologicalRole()); + assertEquals(new IRefWebInteractorBiologicalRole(new MolecularInteractionOntologyTermID("MI:0000"), + "unspecified"), record.getInteractorB().getBiologicalRole()); + assertEquals(new IRefWebInteractorExperimentalRole(new MolecularInteractionOntologyTermID("MI:0000"), + "unspecified"), record.getInteractorA().getExperimentalRole()); + assertEquals(new IRefWebInteractorExperimentalRole(new MolecularInteractionOntologyTermID("MI:0000"), + "unspecified"), record.getInteractorB().getExperimentalRole()); - assertEquals(new IRefWebInteractorType(new MolecularInteractionOntologyTermID("MI:0326"), "protein"), record - .getInteractorA().getInteractorType()); - assertEquals(new IRefWebInteractorType(new MolecularInteractionOntologyTermID("MI:0326"), "protein"), record - .getInteractorB().getInteractorType()); + assertEquals(new IRefWebInteractorType(new MolecularInteractionOntologyTermID("MI:0326"), "protein"), + record.getInteractorA().getInteractorType()); + assertEquals(new IRefWebInteractorType(new MolecularInteractionOntologyTermID("MI:0326"), "protein"), + record.getInteractorB().getInteractorType()); assertNull(record.getInteraction().getHostOrgTaxonomyId()); - + assertEquals("2010/05/18", record.getCreationDate()); assertEquals("2010/05/18", record.getUpdateDate()); - + assertEquals(new RogId("Ivetsb7L/rt8ds+TyhtJZKxTtVE9796"), record.getInteractorA().getChecksum()); assertEquals(new RogId("HdW51RuiujpUxo0Fu8TbWz3Yk8c10090"), record.getInteractorB().getChecksum()); 
assertEquals(new RigId("++f9f/9TQhDLvdrGu56SalIhHSA"), record.getInteraction().getChecksumInteraction()); - + assertFalse(record.getInteraction().isNegative()); - - assertNull(record.getInteractorA().getOriginalReference()); + + assertEquals(new ProbableErrorDataSourceIdentifier("\"1FMO_I\"", null, + "Input is not a known protein accession pattern: \"1FMO_I\""), record.getInteractorA() + .getOriginalReference()); assertEquals(new RefSeqID("NP_032880"), record.getInteractorB().getOriginalReference()); assertEquals(new PdbID("1FMO_I"), record.getInteractorA().getFinalReference()); assertEquals(new RefSeqID("NP_032880"), record.getInteractorB().getFinalReference()); - + assertEquals("PT", record.getInteractorA().getMappingScore()); assertEquals("P", record.getInteractorB().getMappingScore()); - + assertEquals(new IrogId("9981084"), record.getInteractorA().getIrogid()); assertEquals(new IrogId("2201887"), record.getInteractorB().getIrogid()); assertEquals(new IrigId("617146"), record.getInteraction().getIrigid()); - + assertEquals(new CrogId("Ivetsb7L/rt8ds+TyhtJZKxTtVE9796"), record.getInteractorA().getCrogid()); assertEquals(new CrogId("HdW51RuiujpUxo0Fu8TbWz3Yk8c10090"), record.getInteractorB().getCrogid()); assertEquals(new CrigId("++f9f/9TQhDLvdrGu56SalIhHSA"), record.getInteraction().getCrigid()); - + assertEquals(new IcrogId("9981084"), record.getInteractorA().getIcrogid()); assertEquals(new IcrogId("2201887"), record.getInteractorB().getIcrogid()); assertEquals(new IcrigId("617146"), record.getInteraction().getIcrigid()); - + assertNull(record.getInteraction().getImexId()); - + assertEquals("X", record.getInteraction().getEdgeType()); - + assertEquals(2, record.getInteraction().getNumParticipants()); } else { diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java index 
8083235..8c9f3f9 100644 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParserTest.java @@ -33,14 +33,22 @@ * #L% */ +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.fail; + import java.io.IOException; +import java.util.NoSuchElementException; -import org.junit.Ignore; import org.junit.Test; import edu.ucdenver.ccp.common.file.CharacterEncoding; -import edu.ucdenver.ccp.datasource.fileparsers.RecordReader; import edu.ucdenver.ccp.datasource.fileparsers.test.RecordReaderTester; +import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; +import edu.ucdenver.ccp.datasource.identifiers.hgnc.HgncID; +import edu.ucdenver.ccp.datasource.identifiers.obo.ProteinOntologyId; /** * @@ -57,16 +65,60 @@ protected String getSampleFileName() { } @Override - protected RecordReader initSampleRecordReader() throws IOException { + protected ProMappingFileParser initSampleRecordReader() throws IOException { return new ProMappingFileParser(sampleInputFile, CharacterEncoding.US_ASCII); } - @Ignore("Test not yet implemented.. 
") @Test - public void testParser() { - + public void testParser() throws IOException { + ProMappingFileParser parser = initSampleRecordReader(); + + if (parser.hasNext()) { + validateRecord1(parser.next()); + } else { + fail("Parser should have returned a record here."); + } + + if (parser.hasNext()) { + validateRecord2(parser.next()); + } else { + fail("Parser should have returned a record here."); + } + + if (parser.hasNext()) { + validateRecord3(parser.next()); + } else { + fail("Parser should have returned a record here."); + } + assertFalse(parser.hasNext()); + + try { + parser.next(); + fail("Should have thrown a NoSuchElementException."); + } catch (NoSuchElementException nsee) { + // do nothing, exception expected + } + + } + + private void validateRecord(ProMappingRecord record, ProteinOntologyId expectedPrId, String expectedMappingType, + DataSourceIdentifier expectedTargetId) { + assertEquals(expectedPrId, record.getProteinOntologyId()); + assertEquals(expectedMappingType, record.getMappingType()); + assertEquals(expectedTargetId, record.getTargetRecordId()); + } + + private void validateRecord1(ProMappingRecord record) { + validateRecord(record, new ProteinOntologyId("PR:000000005"), "is_a", new HgncID("HGNC:11773")); + } + + private void validateRecord2(ProMappingRecord record) { + validateRecord(record, new ProteinOntologyId("PR:000000005"), "is_a", new UnknownDataSourceIdentifier( + "UniProtKB_VAR:VAR_022359", null)); + } + private void validateRecord3(ProMappingRecord record) { + validateRecord(record, new ProteinOntologyId("PR:000000006"), "exact", new UniProtID("P37173")); } - } diff --git a/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/pro/PRO_promapping.txt b/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/pro/PRO_promapping.txt index 8f1dbf7..dab78df 100644 --- a/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/pro/PRO_promapping.txt +++ 
b/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/pro/PRO_promapping.txt @@ -1,3 +1,3 @@ PR:000000005 HGNC:11773 is_a -PR:000000005 MGI:98729 is_a +PR:000000005 UniProtKB_VAR:VAR_022359 is_a PR:000000006 UniProtKB:P37173 exact \ No newline at end of file diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java index e13efa2..a214cd7 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java @@ -372,7 +372,19 @@ public String getLocalName() { KIAODIP("http://kabob.ucdenver.edu/iao/dip/"), KIAOIREFWEB("http://kabob.ucdenver.edu/iao/irefweb/"), KIAOEMBL("http://kabob.ucdenver.edu/iao/embl/"), - KRO("http://kabob.ucdenver.edu/ro/"); + KRO("http://kabob.ucdenver.edu/ro/"), + + /** + * to be used for data source identifiers whose source is unknown or not yet modeled. + */ + UNKNOWN(null), + /** + * to be used for data source identifiers that are thought to be incorrect, e.g. + * a UniProt ID that doesn't match the expected regular expression or an NCBI Gene + * ID that is not an integer. 
+ */ + PROBABLE_ERROR(null); + public final String longName; DataSource(String longName) { diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceElement.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceElement.java index 00d1229..3c370b6 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceElement.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceElement.java @@ -45,7 +45,7 @@ public abstract class DataSourceElement { /** * raw data element */ - private T dataElement; + protected T dataElement; /** * Default constructor. diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java index fdbc8c6..08de587 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java @@ -132,16 +132,18 @@ import edu.ucdenver.ccp.identifier.publication.PubMedID; /** - * provides various methods to map from an ID in database or ontology files to instances of - * identifier classes under edu.ucdenver.ccp.datasource.identifiers. + * provides various methods to map from an ID in database or ontology files to + * instances of identifier classes under + * edu.ucdenver.ccp.datasource.identifiers. * - * These are basically factory methods. Given some information about where the ID came from and an - * ID string, it creates an instance of an identifier class related to the source. This is done for - * DataSourceIdentifiers, PMID identifiers and others. + * These are basically factory methods. 
Given some information about where the + * ID came from and an ID string, it creates an instance of an identifier class + * related to the source. This is done for DataSourceIdentifiers, PMID + * identifiers and others. * - * Three functions named resolveId(): - a value of the DataSource enum and an ID string. - a name of - * a data source and and ID string. - an ID string that is parsed to discover the data source it - * came from. + * Three functions named resolveId(): - a value of the DataSource enum and an ID + * string. - a name of a data source and an ID string. - an ID string that is + * parsed to discover the data source it came from. **/ public class DataSourceIdResolver { @@ -210,7 +212,8 @@ public static DataSourceIdentifier resolveId(DataSource dataSource, String da } - // TODO: remove this method and replace its use with resolveId(DataSource, String) + // TODO: remove this method and replace its use with resolveId(DataSource, + // String) public static DataSourceIdentifier resolveId(String databaseName, String databaseObjectID) { if (databaseName.equalsIgnoreCase("MGI")) return new MgiGeneID(databaseObjectID); @@ -290,8 +293,9 @@ else if (databaseName.equalsIgnoreCase("url")) { || databaseName.equalsIgnoreCase("GenBank Protein Database")) return new GenBankID(databaseObjectID); - logger.warn("Unable to resolve data source identifier: datasource=" + databaseName + " id=" + databaseObjectID); - return null; + logger.warn("Unable to resolve data source identifier: datasource=" + databaseName + " id=" + databaseObjectID + + ". Using UnknownDataSourceIdentifier."); + return new UnknownDataSourceIdentifier(databaseObjectID, databaseName); } /** @@ -445,14 +449,13 @@ else if (geneIDStr.startsWith("CL:")) else if (geneIDStr.startsWith("NCBITaxon:")) return new NcbiTaxonomyID(StringUtil.removePrefix(geneIDStr, "NCBITaxon:")); - logger.error(String - .format("Unknown gene ID format: %s. 
Cannot create DataElementIdentifier.", geneIDStr)); - + logger.warn(String.format("Unhandled gene ID format: %s. Creating UnknownDataSourceIdentifier.", geneIDStr)); + return new UnknownDataSourceIdentifier(geneIDStr, null); } catch (IllegalArgumentException e) { - logger.warn("Invalid ID detected... " + e.getMessage()); + logger.warn("Invalid ID detected... " + e.getMessage()); + return new ProbableErrorDataSourceIdentifier(geneIDStr, null, e.getMessage()); } - return null; } /** @@ -460,21 +463,23 @@ else if (geneIDStr.startsWith("NCBITaxon:")) * * @param interactionIDStr * id to resolve - * @return identifier if argument is resolvable and supported; otherwise, return null. + * @return identifier if argument is resolvable and supported; otherwise, + * an UnknownDataSourceIdentifier wrapping the raw ID string. */ private static DataSourceIdentifier resolveInteractionID(String interactionIDStr) { - if (interactionIDStr.startsWith("intact:")) + if (interactionIDStr.startsWith("intact:")) { return new IntActID(StringUtil.removePrefix(interactionIDStr, "intact:")); - else if (interactionIDStr.startsWith("bind:")) + } else if (interactionIDStr.startsWith("bind:")) { return new BindInteractionID(StringUtil.removePrefix(interactionIDStr, "bind:")); - else if (interactionIDStr.startsWith("grid:")) + } else if (interactionIDStr.startsWith("grid:")) { return new BioGridID(StringUtil.removePrefix(interactionIDStr, "grid:")); - else if (interactionIDStr.startsWith("mint:")) + } else if (interactionIDStr.startsWith("mint:")) { return new MintID(StringUtil.removePrefix(interactionIDStr, "mint:")); + } - logger.error(String.format("Unknown interaction ID format: %s. Cannot create DataElementIdentifier.", + logger.warn(String.format("Unknown interaction ID format: %s. 
Cannot create DataElementIdentifier.", interactionIDStr)); - return null; + return new UnknownDataSourceIdentifier(interactionIDStr, null); } /** @@ -482,8 +487,8 @@ else if (interactionIDStr.startsWith("mint:")) * * @param interactionIDStrs * ids to resolve - * @return identifier if all members of interactionIDStrs are resolvable and - * supported; otherwise, return null. + * @return identifier if all members of interactionIDStrs are + * resolvable and supported; otherwise, return null. */ public static Set> resolveInteractionIDs(Set interactionIDStrs) { Set> interactionIDs = new HashSet>(); @@ -500,26 +505,29 @@ public static Set> resolveInteractionIDs(Set int * Resolve Pubmed ID from value that starts with prefix 'pubmed:'. * * @param pmidStr - * @return id if value following prefix is a positive integer; otherwise, null + * @return id if value following prefix is a positive integer; otherwise, + * a ProbableErrorDataSourceIdentifier */ - public static PubMedID resolvePubMedID(String pmidStr) { + public static DataSourceIdentifier resolvePubMedID(String pmidStr) { String prefix = "pubmed:"; if (pmidStr.startsWith(prefix)) { String id = StringUtil.removePrefix(pmidStr, prefix); - if (StringUtil.isIntegerGreaterThanZero(id)) + if (StringUtil.isIntegerGreaterThanZero(id)) { return new PubMedID(id); + } } - logger.error(String.format("Unknown PubMed ID format: %s. Cannot create PubMedID.", pmidStr)); - return null; + logger.warn(String.format("Unknown PubMed ID format: %s. 
Cannot create PubMedID.", pmidStr)); + return new ProbableErrorDataSourceIdentifier(pmidStr, null, "Invalid PubMedID, must be an integer."); } - public static Set resolvePubMedIDs(Set pmidStrs) { - Set pmids = new HashSet(); + public static Set> resolvePubMedIDs(Set pmidStrs) { + Set> pmids = new HashSet>(); for (String pmidStr : pmidStrs) { - PubMedID id = resolvePubMedID(pmidStr); - if (id == null) + DataSourceIdentifier id = resolvePubMedID(pmidStr); + if (id == null) { return null; + } pmids.add(id); } @@ -530,8 +538,9 @@ public static Set> resolveIds(Set databaseObject Set> databaseObjectIDs = new HashSet>(); for (String databaseObjectIDStr : databaseObjectIDStrs) { DataSourceIdentifier id = resolveId(databaseObjectIDStr); - if (id != null) + if (id != null) { databaseObjectIDs.add(id); + } } return databaseObjectIDs; } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdentifier.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdentifier.java index b760dcf..a9cecd0 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdentifier.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdentifier.java @@ -43,7 +43,7 @@ */ public abstract class DataSourceIdentifier extends DataSourceElement { - private final DataSource dataSource; + protected final DataSource dataSource; /** * Default constructor. 
diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java index 8c7e9dc..f192f76 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java @@ -43,6 +43,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.log4j.Logger; + import edu.ucdenver.ccp.common.collections.CollectionsUtil; import edu.ucdenver.ccp.datasource.identifiers.ebi.embl.EmblID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.GenBankID; @@ -54,11 +56,14 @@ * Resolution of accession identifiers based on prefixes available here: * http://www.ncbi.nlm.nih.gov/Sequin/acc.html * - * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu + * @author Colorado Computational Pharmacology, UC Denver; + * ccpsupport@ucdenver.edu * */ public class NucleotideAccessionResolver { + private static final Logger logger = Logger.getLogger(NucleotideAccessionResolver.class); + private static final Pattern ACC_PATTERN = Pattern.compile("([A-Z]+)\\d+\\.?\\d*"); private static final Set GENBANK_ID_PREFIXES = CollectionsUtil.createSet("CH", "CM", "DS", "EM", "EN", @@ -136,7 +141,8 @@ public static DataSourceIdentifier resolveNucleotideAccession(String acc } } } - throw new IllegalArgumentException("Input is not a known nucleotide accession: " + acc); + logger.warn("Input is not a known nucleotide accession: " + acc); + return new ProbableErrorDataSourceIdentifier(acc, null, "Input is not a known nucleotide accession: " + acc); } } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java 
b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java new file mode 100644 index 0000000..f481ad8 --- /dev/null +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java @@ -0,0 +1,65 @@ +package edu.ucdenver.ccp.datasource.identifiers; + +public class ProbableErrorDataSourceIdentifier extends DataSourceIdentifier { + + private final String dataSourceStr; + private final String errorMessage; + + public ProbableErrorDataSourceIdentifier(String resourceID, String dataSourceStr, String errorMessage) { + super(resourceID, DataSource.PROBABLE_ERROR); + this.dataSourceStr = dataSourceStr; + this.errorMessage = errorMessage; + } + + @Override + public String validate(String resourceID) throws IllegalArgumentException { + return resourceID; + } + + public String getDataSourceStr() { + return dataSourceStr; + } + + public String getErrorMessage() { + return errorMessage; + } + + @Override + public String toString() { + return "ProbableErrorDataSourceIdentifier [dataSourceStr=" + dataSourceStr + ", errorMessage=" + errorMessage + + ", getDataElement()=" + getDataElement() + "]"; + } + + @Override + public int hashCode() { + final int prime = 31; + int result = super.hashCode(); + result = prime * result + ((dataSourceStr == null) ? 
0 : dataSourceStr.hashCode()); + return result; + } + + @Override + public boolean equals(Object obj) { + if (this == obj) + return true; + if (!super.equals(obj)) + return false; + if (getClass() != obj.getClass()) + return false; + ProbableErrorDataSourceIdentifier other = (ProbableErrorDataSourceIdentifier) obj; + if (dataSourceStr == null) { + if (other.dataSourceStr != null) + return false; + } else if (!dataSourceStr.equals(other.dataSourceStr)) + return false; + return true; + } + + + + + + + + +} diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java index 33862f2..3827dc0 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java @@ -39,6 +39,8 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.log4j.Logger; + import edu.ucdenver.ccp.datasource.identifiers.ebi.embl.EmblID; import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.GenBankID; @@ -49,10 +51,13 @@ * Resolution of accession identifiers based on prefixes available here: * http://www.ncbi.nlm.nih.gov/Sequin/acc.html * - * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu + * @author Colorado Computational Pharmacology, UC Denver; + * ccpsupport@ucdenver.edu * */ public class ProteinAccessionResolver { + + private static final Logger logger = Logger.getLogger(ProteinAccessionResolver.class); private static final Pattern ACC_PATTERN = Pattern.compile("([A-Z]{3})\\d+\\.?\\d*"); private static final String VALID_UNIPROT_PATTERN_1 = "[A-NR-Z][0-9][A-Z][A-Z0-9][A-Z0-9][0-9]"; @@ -100,7 +105,9 @@ public static DataSourceIdentifier 
resolveProteinAccession(String acc) { return new GenBankID(acc); } } - throw new IllegalArgumentException("Input is not a known protein accession pattern: " + acc); + logger.warn("Input is not a known protein accession pattern: " + acc); + return new ProbableErrorDataSourceIdentifier(acc, null, "Input is not a known protein accession pattern: " + + acc); } } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java new file mode 100644 index 0000000..65eeb4c --- /dev/null +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java @@ -0,0 +1,21 @@ +package edu.ucdenver.ccp.datasource.identifiers; + +public class UnknownDataSourceIdentifier extends DataSourceIdentifier { + + private final String dataSourceStr; + + public UnknownDataSourceIdentifier(String resourceID, String dataSourceStr) { + super(resourceID, DataSource.UNKNOWN); + this.dataSourceStr = dataSourceStr; + } + + @Override + public String validate(String resourceID) throws IllegalArgumentException { + return resourceID; + } + + public String getDataSourceStr() { + return dataSourceStr; + } + +} From f85466a2b162f7fc55a95f89327186527f8d4a78 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 8 Feb 2016 15:40:05 -0700 Subject: [PATCH 13/36] Added handling for non-normalized (no URI) and erroneous identifiers Instead of being excluded from the output RDF they are now cataloged as either NonNormalizedIdentifierRecords or ErroneousIdentifierRecords. 
--- .../ProbableErrorDataSourceIdentifier.java | 34 ++ .../UnknownDataSourceIdentifier.java | 34 ++ .../rdf/ice/ErroneousIdentifierRecord.java | 72 ++++ .../ice/NonNormalizedIdentifierRecord.java | 64 +++ .../rdfizer/rdf/ice/RdfRecordUriFactory.java | 29 +- .../rdfizer/rdf/ice/RdfRecordUtil.java | 250 ++++++++---- .../rdfizer/rdf/ice/RdfRecordWriterImpl.java | 378 +++++++++--------- ...ImplErroneousAndUnknownIdentifierTest.java | 180 +++++++++ 8 files changed, 771 insertions(+), 270 deletions(-) create mode 100644 datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/ErroneousIdentifierRecord.java create mode 100644 datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/NonNormalizedIdentifierRecord.java create mode 100644 datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplErroneousAndUnknownIdentifierTest.java diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java index f481ad8..f0394aa 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProbableErrorDataSourceIdentifier.java @@ -1,5 +1,39 @@ package edu.ucdenver.ccp.datasource.identifiers; +/* + * #%L + * Colorado Computational Pharmacology's datasource + * project + * %% + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * #L% + */ + public class ProbableErrorDataSourceIdentifier extends DataSourceIdentifier { private final String dataSourceStr; diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java index 65eeb4c..14a91db 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/UnknownDataSourceIdentifier.java @@ -1,5 +1,39 @@ package edu.ucdenver.ccp.datasource.identifiers; +/* + * #%L + * Colorado Computational Pharmacology's datasource + * project + * %% + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * #L% + */ + public class UnknownDataSourceIdentifier extends DataSourceIdentifier { private final String dataSourceStr; diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/ErroneousIdentifierRecord.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/ErroneousIdentifierRecord.java new file mode 100644 index 0000000..e18baea --- /dev/null +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/ErroneousIdentifierRecord.java @@ -0,0 +1,72 @@ +package edu.ucdenver.ccp.datasource.rdfizer.rdf.ice; + +/* + * #%L + * Colorado Computational Pharmacology's datasource + * project + * %% + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * #L% + */ + +import edu.ucdenver.ccp.datasource.fileparsers.Record; +import edu.ucdenver.ccp.datasource.fileparsers.RecordField; +import edu.ucdenver.ccp.datasource.identifiers.DataSource; + +@Record(dataSource = DataSource.KABOB) +public class ErroneousIdentifierRecord { + + @RecordField + private final String identifier; + + @RecordField + private final String datasource; + + @RecordField + private final String comment; + + public ErroneousIdentifierRecord(String identifier, String datasource, String comment) { + super(); + this.identifier = identifier; + this.datasource = datasource; + this.comment = comment; + } + + public String getIdentifier() { + return identifier; + } + + public String getDatasource() { + return datasource; + } + + public String getComment() { + return comment; + } + +} diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/NonNormalizedIdentifierRecord.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/NonNormalizedIdentifierRecord.java new file mode 100644 index 0000000..24cb1c5 --- /dev/null +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/NonNormalizedIdentifierRecord.java @@ -0,0 +1,64 @@ +package edu.ucdenver.ccp.datasource.rdfizer.rdf.ice; + +/* + * #%L + * Colorado Computational Pharmacology's datasource + * project + * %% + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * #L% + */ + +import edu.ucdenver.ccp.datasource.fileparsers.Record; +import edu.ucdenver.ccp.datasource.fileparsers.RecordField; +import edu.ucdenver.ccp.datasource.identifiers.DataSource; + +@Record(dataSource = DataSource.KABOB) +public class NonNormalizedIdentifierRecord { + + @RecordField + private final String identifier; + + @RecordField + private final String datasource; + + public NonNormalizedIdentifierRecord(String identifier, String datasource) { + super(); + this.identifier = identifier; + this.datasource = datasource; + } + + public String getIdentifier() { + return identifier; + } + + public String getDatasource() { + return datasource; + } + +} diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUriFactory.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUriFactory.java index 2a5b87c..ff8d7a4 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUriFactory.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUriFactory.java @@ -44,15 +44,19 @@ import java.util.Map.Entry; import java.util.Set; +import org.openrdf.model.Statement; import org.openrdf.model.Value; import org.openrdf.model.impl.URIImpl; import org.openrdf.rio.ntriples.NTriplesUtil; +import edu.ucdenver.ccp.common.collections.CollectionsUtil; import edu.ucdenver.ccp.common.digest.DigestUtil; import edu.ucdenver.ccp.common.reflection.PrivateAccessor; import edu.ucdenver.ccp.datasource.fileparsers.RecordField; import edu.ucdenver.ccp.datasource.fileparsers.RecordUtil; import edu.ucdenver.ccp.datasource.identifiers.DataSource; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.rdfizer.rdf.vocabulary.KIAO; /** @@ -178,6 +182,30 @@ private static List 
getSortedFieldValueUriStrs(Collection fieldV * could be a collection, if so we return one string per value */ private static String getFieldValueUri(Object fieldValue) { + /* address unknown and probable error data source identifiers here? */ + if (fieldValue instanceof UnknownDataSourceIdentifier) { + UnknownDataSourceIdentifier id = (UnknownDataSourceIdentifier) fieldValue; + NonNormalizedIdentifierRecord record = new NonNormalizedIdentifierRecord(id.getDataElement(), id.getDataSourceStr()); + URIImpl recordUri = RdfRecordUriFactory.createRecordUri(record); + List recordInstanceStatements = RdfRecordUtil.getRecordInstanceStatements(record, System.currentTimeMillis(), + recordUri, null, null, null); + recordInstanceStatements.remove(0); + /* this is used to generate sha1 hashes, so it doesn't need to be a true uri */ + return CollectionsUtil.createDelimitedString(recordInstanceStatements, " "); + } else if (fieldValue instanceof ProbableErrorDataSourceIdentifier) { + ProbableErrorDataSourceIdentifier id = (ProbableErrorDataSourceIdentifier) fieldValue; + ErroneousIdentifierRecord record = new ErroneousIdentifierRecord(id.getDataElement(), + id.getDataSourceStr(), id.getErrorMessage()); + URIImpl recordUri = RdfRecordUriFactory.createRecordUri(record); + List recordInstanceStatements = RdfRecordUtil.getRecordInstanceStatements(record, System.currentTimeMillis(), + recordUri, null, null, null); + /* + * the first statement returned is a dataset has_part record triple + * which we do not need + */ + recordInstanceStatements.remove(0); + return CollectionsUtil.createDelimitedString(recordInstanceStatements, " "); + } Value value = RdfUtil.getValue(fieldValue); return NTriplesUtil.toNTriplesString(value); } @@ -224,7 +252,6 @@ private static Collection getFieldValues(Object record, Field field) { return null; } - int fieldCount = 0; Collection fieldValues = new ArrayList(); if (!(fieldValue instanceof Collection)) { diff --git 
a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtil.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtil.java index 1e2b49b..d7e0fe6 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtil.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordUtil.java @@ -53,6 +53,7 @@ import org.openrdf.model.impl.StatementImpl; import org.openrdf.model.impl.URIImpl; +import edu.ucdenver.ccp.common.collections.CollectionsUtil; import edu.ucdenver.ccp.common.reflection.PrivateAccessor; import edu.ucdenver.ccp.common.string.StringConstants; import edu.ucdenver.ccp.datasource.fileparsers.DataRecord; @@ -60,6 +61,8 @@ import edu.ucdenver.ccp.datasource.fileparsers.RecordField; import edu.ucdenver.ccp.datasource.fileparsers.RecordUtil; import edu.ucdenver.ccp.datasource.identifiers.DataSource; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier; import edu.ucdenver.ccp.datasource.rdfizer.rdf.filter.DuplicateStatementFilter; import edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.RdfRecordUriFactory.IncludeVersion; import edu.ucdenver.ccp.datasource.rdfizer.rdf.vocabulary.DC; @@ -72,26 +75,31 @@ /** * Static utility functions for creating RDF * - * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu + * @author Colorado Computational Pharmacology, UC Denver; + * ccpsupport@ucdenver.edu */ public class RdfRecordUtil { - private static final Logger logger = Logger.getLogger(RdfRecordUtil.class); - + // private static final Logger logger = + // Logger.getLogger(RdfRecordUtil.class); + // /** // * // * // * @param recordClass // * @return Collection of created statements // */ - // public static Collection getRecordSchemaStatements(Class recordClass) + // public static Collection + // 
getRecordSchemaStatements(Class recordClass) // { // Collection stmts = new LinkedHashSet(); - // RdfNamespace ns = RdfNamespace.getNamespace(RecordUtil.getRecordDataSource(recordClass)); + // RdfNamespace ns = + // RdfNamespace.getNamespace(RecordUtil.getRecordDataSource(recordClass)); // String recordComment = RecordUtil.getRecordComment(recordClass); // String recordVersion = RecordUtil.getRecordSchemaVersion(recordClass); // - // URIImpl recordClsUri = RdfUtil.createKiaoUri(ns, recordClass.getSimpleName()); + // URIImpl recordClsUri = RdfUtil.createKiaoUri(ns, + // recordClass.getSimpleName()); // stmts.add(new StatementImpl(recordClsUri, RDFS.SUBCLASS_OF.uri(), // IAO.INFORMATION_CONTENT_ENITITY.uri())); // if (recordComment != null && !recordComment.isEmpty()) { @@ -103,7 +111,8 @@ public class RdfRecordUtil { // // Map fieldToRecordFieldAnnotationMap = RecordUtil // .getFieldToRecordFieldAnnotationsMap(recordClass); - // for (Entry entry : fieldToRecordFieldAnnotationMap.entrySet()) { + // for (Entry entry : + // fieldToRecordFieldAnnotationMap.entrySet()) { // if (isFieldSubRecord(entry.getKey())) { // Field f = entry.getKey(); // if (Collection.class.isAssignableFrom(f.getType())) { @@ -114,7 +123,8 @@ public class RdfRecordUtil { // stmts.addAll(getRecordSchemaStatements((Class) genericTypes[0])); // } // } else { - // throw new IllegalStateException("Non-parameterized collection detected in record class: " + // throw new + // IllegalStateException("Non-parameterized collection detected in record class: " // + recordClass.getName() + " Please parameterize."); // } // } else { @@ -122,14 +132,18 @@ public class RdfRecordUtil { // } // } // String fieldName = entry.getKey().getName(); - // String fieldComment = RecordUtil.getRecordFieldComment(recordClass, fieldName); - // String fieldVersion = RecordUtil.getRecordFieldVersion(recordClass, fieldName); + // String fieldComment = RecordUtil.getRecordFieldComment(recordClass, + // fieldName); + // String 
fieldVersion = RecordUtil.getRecordFieldVersion(recordClass, + // fieldName); // boolean isKeyField = RecordUtil.isKeyRecordField(recordClass, fieldName); // - // URIImpl fieldTemplateUri = RdfRecordUriFactory.createDataFieldTemplateUri(recordClass, + // URIImpl fieldTemplateUri = + // RdfRecordUriFactory.createDataFieldTemplateUri(recordClass, // fieldName, // IncludeVersion.YES); - // stmts.add(new StatementImpl(recordClsUri, RO.HAS_PART.uri(), fieldTemplateUri)); + // stmts.add(new StatementImpl(recordClsUri, RO.HAS_PART.uri(), + // fieldTemplateUri)); // if (fieldComment != null && !fieldComment.isEmpty()) { // stmts.add(new StatementImpl(fieldTemplateUri, RDFS.COMMENT.uri(), // RdfUtil.createLiteral(fieldComment))); @@ -140,7 +154,8 @@ public class RdfRecordUtil { // if (isKeyField) { // //stmts.add(new StatementImpl(fieldTemplateUri, DC.IDENTIFIER.uri(), // RdfUtil.createLiteral(isKeyField))); - // stmts.add(new StatementImpl(recordClsUri, KIAO.HAS_KEY_FIELD.uri(), fieldTemplateUri)); + // stmts.add(new StatementImpl(recordClsUri, KIAO.HAS_KEY_FIELD.uri(), + // fieldTemplateUri)); // } // } // @@ -148,8 +163,10 @@ public class RdfRecordUtil { // } // /** - // * Generate statements about datasets, records and their types for specified namespace within - // * KABOB namespace. Each class represents a dataset made up of its class of records and their + // * Generate statements about datasets, records and their types for + // specified namespace within + // * KABOB namespace. Each class represents a dataset made up of its class + // of records and their // * fields. 
// * // * @param recordTypes @@ -157,11 +174,13 @@ public class RdfRecordUtil { // * target namespace // * @return statements // */ - // public static List getRecordSchemaDefinitionStatements(Class + // getRecordSchemaDefinitionStatements(Class recordClass) { // List statements = new ArrayList(); // - // RdfNamespace ns = RdfNamespace.getNamespace(RecordUtil.getRecordDataSource(recordClass)); + // RdfNamespace ns = + // RdfNamespace.getNamespace(RecordUtil.getRecordDataSource(recordClass)); // // URIImpl dataSourceUri = RdfUtil.createKiaoUri(ns, ns.lowerName() + // KIAO.KABOB_DATASOURCE.termName()); @@ -188,14 +207,16 @@ public class RdfRecordUtil { // statements.add(new StatementImpl(fieldUri, RDFS.SUBCLASS_OF.uri(), // KIAO.KABOB_DATAFIELD.uri())); // - // statements.addAll(getRecordFieldDeclarationStatements(recordClass, null)); + // statements.addAll(getRecordFieldDeclarationStatements(recordClass, + // null)); // // return statements; // } /** - * Generate statements about class' fields specified namespace within KABOB namespace. Each - * field is a subclass of generic field in namespace and part of dataset. + * Generate statements about class' fields specified namespace within KABOB + * namespace. Each field is a subclass of generic field in namespace and + * part of dataset. * * * @param recordClass @@ -205,13 +226,14 @@ public class RdfRecordUtil { * @param version * structural version label * @param parentSchemaUri - * if not null, record schema is asserted to be {@link RdfPredicate#RO_PARTOF} parent - * schema. + * if not null, record schema is asserted to be + * {@link RdfPredicate#RO_PARTOF} parent schema. * @param fieldComment - * the field comment is used to capture @RecordField comments on fields that are - * subrecords + * the field comment is used to capture @RecordField comments on + * fields that are subrecords * @param isKeyField - * @return statements about fields; empty result is returned for anonymous classes. 
+ * @return statements about fields; empty result is returned for anonymous + * classes. */ public static Collection getRecordSchemaStatements(Class recordClass, URIImpl parentSchemaUri, String fieldComment, boolean isKeyField) { @@ -219,16 +241,18 @@ public static Collection getRecordSchemaStatements(Class Collection statements = new ArrayList(); /* - * The following two statements are meta statements that will be redundant if multiple - * record schemas are combined. Note that the first statement is supposed to be a self-loop. + * The following two statements are meta statements that will be + * redundant if multiple record schemas are combined. Note that the + * first statement is supposed to be a self-loop. */ statements.add(new StatementImpl(KIAO.SCHEMA.uri(), RO.HAS_PART.uri(), KIAO.SCHEMA.uri())); statements.add(new StatementImpl(KIAO.SCHEMA.uri(), RO.HAS_PART.uri(), KIAO.FIELD.uri())); /* - * The following adds the kiaosource:Record rdfs:subClassOf iao:IAO_0000030 (information - * content entity) triple. This triple is not really part of the schema, however it only - * needs to be added one time so this seems like a good place to put it. + * The following adds the kiaosource:Record rdfs:subClassOf + * iao:IAO_0000030 (information content entity) triple. This triple is + * not really part of the schema, however it only needs to be added one + * time so this seems like a good place to put it. */ URIImpl recordClsUri = RdfUtil.createKiaoUri(ns, recordClass.getSimpleName()); statements.add(new StatementImpl(recordClsUri, RDFS.SUBCLASS_OF.uri(), IAO.INFORMATION_CONTENT_ENITITY.uri())); @@ -272,8 +296,8 @@ public static Collection getRecordSchemaStatements(Class for (Field field : sortedFields) { /* - * If the RecordField annotation is not present, then this field does not get serialized - * in the RDF, e.g. the logger field + * If the RecordField annotation is not present, then this field + * does not get serialized in the RDF, e.g. 
the logger field */ if (field.isAnnotationPresent(RecordField.class)) { String fComment = RecordUtil.getRecordFieldComment(recordClass, field.getName()); @@ -330,9 +354,10 @@ private static String getFieldLabel(Class recordClass, String fieldName) { /** * @param recordClass - * @return a label for the record by first looking for an explicitly defined label in the @Record - * annotation. If not present, a label is generated by adding spaces to replace - * camel-case in the Record name + * @return a label for the record by first looking for an explicitly defined + * label in the @Record annotation. If not present, a label is + * generated by adding spaces to replace camel-case in the Record + * name */ private static String getRecordLabel(Class recordClass) { String label = RecordUtil.getRecordLabel(recordClass); @@ -351,7 +376,8 @@ private static String getRecordLabel(Class recordClass) { * * @param field * to check - * @return field type, or generic type if field's type is a {@link Collection} + * @return field type, or generic type if field's type is a + * {@link Collection} */ private static Class getFieldType(Field field) { Class klass = field.getType(); @@ -370,8 +396,8 @@ private static Class getFieldType(Field field) { } /** - * Determine whether class should be treated as sub-record definition. If field type is - * collection, it's generic type is used. + * Determine whether class should be treated as sub-record definition. If + * field type is collection, it's generic type is used. 
* * @param field * to check @@ -403,12 +429,16 @@ private static boolean isFieldSubRecord(Field field) { private static boolean isFieldSubRecord(Class klass) { return klass.isAnnotationPresent(Record.class); // return DataRecord.class.isAssignableFrom(klass); - // if (!(DataSourceElement.class.isAssignableFrom(klass) || klass.isPrimitive() || + // if (!(DataSourceElement.class.isAssignableFrom(klass) || + // klass.isPrimitive() || // klass.isArray() // || klass.isEnum() || klass.isSynthetic() || klass.isAnnotation() - // || Collection.class.isAssignableFrom(klass) || String.class.isAssignableFrom(klass) - // || Number.class.isAssignableFrom(klass) || Boolean.class.isAssignableFrom(klass) - // || java.util.Date.class.isAssignableFrom(klass) || URI.class.isAssignableFrom(klass) || + // || Collection.class.isAssignableFrom(klass) || + // String.class.isAssignableFrom(klass) + // || Number.class.isAssignableFrom(klass) || + // Boolean.class.isAssignableFrom(klass) + // || java.util.Date.class.isAssignableFrom(klass) || + // URI.class.isAssignableFrom(klass) || // URL.class // .isAssignableFrom(klass))) { // return true; @@ -418,7 +448,8 @@ private static boolean isFieldSubRecord(Class klass) { } /** - * Get collection of statements that instance datasource, records and fields for given record. + * Get collection of statements that instance datasource, records and fields + * for given record. * * @param record * @param src @@ -459,13 +490,13 @@ public static List getDataSourceInstanceStatements(DataReco } /** - * Generate instance statements about this particular instance of {@link DataRecord}. Statements - * include assertions about record and it's fields types and values. All record fields are - * included. + * Generate instance statements about this particular instance of + * {@link DataRecord}. Statements include assertions about record and it's + * fields types and values. All record fields are included. 
* * @param record * instance - * @param filter + * @param filter * @param src * record source * @param alreadyObservedFieldUris @@ -473,14 +504,16 @@ public static List getDataSourceInstanceStatements(DataReco * record instance index * @return statements */ - public static List getRecordInstanceStatements(DataRecord record, long createdTime, URIImpl recordUri, DuplicateStatementFilter filter) { + public static List getRecordInstanceStatements(DataRecord record, long createdTime, URIImpl recordUri, + DuplicateStatementFilter filter) { return getRecordInstanceStatements(record, createdTime, recordUri, null, StringConstants.BLANK, filter); } /** - * Generate instance statements about this particular instance of {@link DataRecord}. Statements - * include assertions about record and it's fields types and values. {@code rdfFields} will be - * used to determine record exclusion rules and output format. + * Generate instance statements about this particular instance of + * {@link DataRecord}. Statements include assertions about record and it's + * fields types and values. {@code rdfFields} will be used to determine + * record exclusion rules and output format. * * @param record * instance @@ -491,13 +524,14 @@ public static List getRecordInstanceStatements(DataRecord record, lon * @param rdfFields * configuration info for field export * @param parentRecordUri - * if not null, used to indicate that record is a subrecord within record described - * by this value + * if not null, used to indicate that record is a subrecord + * within record described by this value * @param readerKey - * label used in generating dataset instance URI; if null, converted to - * {@link StringConstants#BLANK} + * label used in generating dataset instance URI; if null, + * converted to {@link StringConstants#BLANK} * @param alreadyObservedFieldUris - * @return statements ; empty result is returned for anonymous {@code record} class. 
+ * @return statements ; empty result is returned for anonymous + * {@code record} class. */ public static List getRecordInstanceStatements(Object record, long createdTime, URIImpl recordUri, URIImpl parentRecordUri, String readerKey, DuplicateStatementFilter filter) { @@ -538,14 +572,17 @@ public static List getRecordInstanceStatements(Object record, long cr // record instance has template record schema // URIImpl recordSchemaUri = RdfUtil.createKiaoUri( // targetNs, - // targetNs.lowerName() + record.getClass().getSimpleName() + KIAO.KABOB_SCHEMA.termName() + // targetNs.lowerName() + record.getClass().getSimpleName() + + // KIAO.KABOB_SCHEMA.termName() // + RecordUtil.getRecordSchemaVersion(record.getClass())); URIImpl recordSchemaUri = RdfRecordUriFactory.createRecordSchemaUri(record.getClass(), IncludeVersion.YES); statements.add(new StatementImpl(recordUri, KIAO.HAS_TEMPLATE.uri(), recordSchemaUri)); Set fields = RecordUtil.getFieldToRecordFieldAnnotationsMap(record.getClass()).keySet(); List sortedFields = new ArrayList(fields); - Collections.sort(sortedFields, new FieldNameComparator()); // sorted to ease unit testing + Collections.sort(sortedFields, new FieldNameComparator()); // sorted to + // ease unit + // testing for (Field field : sortedFields) { if (isFieldSubRecord(field)) { @@ -568,13 +605,16 @@ public static List getRecordInstanceStatements(Object record, long cr } } else { statements.addAll(getSubrecordStatements(createdTime, recordUri, readerKey, filter, subRecord)); - // URIImpl subRecordUri = RdfRecordUriFactory.createRecordUri(subRecord); - // statements.addAll(getRecordInstanceStatements(subRecord, createdTime, + // URIImpl subRecordUri = + // RdfRecordUriFactory.createRecordUri(subRecord); + // statements.addAll(getRecordInstanceStatements(subRecord, + // createdTime, // subRecordUri, recordUri, // readerKey)); } } else { - Collection fieldValueStmts = getRdfFieldValueStatements(recordUri, record, field); + Collection fieldValueStmts = 
getRdfFieldValueStatements(recordUri, record, field, + createdTime, filter); if (fieldValueStmts.isEmpty()) { continue; } @@ -599,13 +639,13 @@ private static List getSubrecordStatements(long createdTime, URIImpl DuplicateStatementFilter filter, Object r) { List statements = new ArrayList(); URIImpl subRecordUri = RdfRecordUriFactory.createRecordUri(r); - List subRecordStmts = getRecordInstanceStatements(r, createdTime, subRecordUri, - recordUri, readerKey, filter); + List subRecordStmts = getRecordInstanceStatements(r, createdTime, subRecordUri, recordUri, + readerKey, filter); if (!filter.alreadyObservedRecordUri(subRecordUri)) { statements.addAll(subRecordStmts); filter.logRecordUri(subRecordUri); } else { -// logger.info("already seen subrecord"); + // logger.info("already seen subrecord"); statements.add(subRecordStmts.get(0)); } return statements; @@ -658,41 +698,46 @@ private static Collection linkFieldToRecord(URIImpl recordUri, URIImp * Generate statements about record's field. 
* * @param fieldInstanceUri - * initial field instance URI; template re-used if field type is a {@link Collection} + * initial field instance URI; template re-used if field type is + * a {@link Collection} * @param record * instance with specified field + * @param filter * @param fieldName * field name * @param commonFieldStatements * shared template statements to be asserted about every field * @return statements */ - private static Collection getRdfFieldValueStatements(URIImpl recordUri, Object record, Field field) { - Object fieldValue = PrivateAccessor.getFieldValue(record, field.getName()); + private static Collection getRdfFieldValueStatements(URIImpl recordUri, Object record, Field field, + long createdTime, DuplicateStatementFilter filter) { + Object fieldValue = PrivateAccessor.getFieldValue(record, field.getName()); if (fieldValue == null) { return new ArrayList(); } - int fieldCount = 0; Collection statements = new ArrayList(); if (!(fieldValue instanceof Collection)) { - fieldCount = 1; URIImpl fieldUri = RdfRecordUriFactory.createFieldUri(record, field, fieldValue); statements.addAll(linkFieldToRecord(recordUri, fieldUri)); statements.addAll(createCommonFieldStatements(record, recordUri, fieldUri, field.getName())); - statements.add(getFieldDenotesValueStatement(fieldUri, fieldValue)); + statements.addAll(getFieldDenotesValueStatement(fieldUri, fieldValue, createdTime, filter)); } else { - /* for each element in the collection a new fieldInstanceUri is generated */ + /* + * for each element in the collection a new fieldInstanceUri is + * generated + */ Collection coll = (Collection) fieldValue; for (Object object : coll) { URIImpl fieldUri = RdfRecordUriFactory.createFieldUri(record, field, object); if (fieldUri != null) { statements.addAll(linkFieldToRecord(recordUri, fieldUri)); - statements.add(getFieldDenotesValueStatement(fieldUri, object)); + statements.addAll(getFieldDenotesValueStatement(fieldUri, object, createdTime, filter)); 
statements.addAll(createCommonFieldStatements(record, recordUri, fieldUri, field.getName())); } } - // int startingFieldCount = Integer.valueOf(fieldInstanceUri.substring(fieldInstanceUri + // int startingFieldCount = + // Integer.valueOf(fieldInstanceUri.substring(fieldInstanceUri // .lastIndexOf(FIELD_VALUE) + 1)) - 1; // // Collection coll = (Collection) fieldValue; @@ -713,8 +758,9 @@ private static Collection getRdfFieldValueStatements(URIImpl recordUr } /** - * Generate statements about field (represented by {@code fieldInstanceUri}, and also a Subject - * in RDF statement) and field's value. Statements generated:
+ * Generate statements about field (represented by {@code fieldInstanceUri}, + * and also a Subject in RDF statement) and field's value. Statements + * generated:
* *
 	 *    .
@@ -724,16 +770,66 @@ private static Collection getRdfFieldValueStatements(URIImpl recordUr
 	 *            rdf field instance URI (subject)
 	 * @param fieldValue
 	 *            value
+	 * @param filter passed through to getRecordInstanceStatements() for the generated identifier record
 	 * @throws IllegalArgumentException
 	 *             if fieldValue's type is {@link Collection}
 	 * @return statements
 	 */
-	public static Statement getFieldDenotesValueStatement(URIImpl fieldInstanceUri, Object fieldValue) {
+	public static List getFieldDenotesValueStatement(URIImpl fieldInstanceUri, Object fieldValue,
+			long createdTime, DuplicateStatementFilter filter) {
 		if (fieldValue instanceof Collection) {
 			throw new IllegalArgumentException("Collection fieldValue is not supported");
 		}
 		Value value = RdfUtil.getValue(fieldValue);
-		return new StatementImpl(fieldInstanceUri, IAO.DENOTES.uri(), value);
+
+		List stmts = new ArrayList();
+		/*
+		 * Identifiers flagged as unknown or as probable errors are wrapped in
+		 * a record (identifier, data source string and, for probable errors,
+		 * the error message); the field then denotes that record's URI.
+		 * Unknown identifiers occur when the file parsing code cannot
+		 * generate an appropriate URI (perhaps "unknown" is not the prefix to
+		 * use here). Probably erroneous identifiers were detected as
+		 * incorrect by the parser, e.g. a UniProt identifier that does not
+		 * follow the pattern stipulated by UniProt. NOTE(review): 'value'
+		 * (computed above) is only used by the final else branch -- confirm
+		 * RdfUtil.getValue() cannot fail for these two identifier types.
+		 */
+		if (fieldValue instanceof UnknownDataSourceIdentifier) {
+			UnknownDataSourceIdentifier id = (UnknownDataSourceIdentifier) fieldValue;
+			NonNormalizedIdentifierRecord record = new NonNormalizedIdentifierRecord(id.getDataElement(), id.getDataSourceStr());
+			URIImpl recordUri = RdfRecordUriFactory.createRecordUri(record);
+			URIImpl parentRecordUri = null;
+			String readerKey = null;
+			List recordInstanceStatements = RdfRecordUtil.getRecordInstanceStatements(record, createdTime,
+					recordUri, parentRecordUri, readerKey, filter);
+			/*
+			 * the first statement returned is a dataset has_part record triple
+			 * which we do not need
+			 */
+			recordInstanceStatements.remove(0);
+			stmts.add(new StatementImpl(fieldInstanceUri, IAO.DENOTES.uri(), recordUri));
+			stmts.addAll(recordInstanceStatements);
+		} else if (fieldValue instanceof ProbableErrorDataSourceIdentifier) {
+			ProbableErrorDataSourceIdentifier id = (ProbableErrorDataSourceIdentifier) fieldValue;
+			ErroneousIdentifierRecord record = new ErroneousIdentifierRecord(id.getDataElement(),
+					id.getDataSourceStr(), id.getErrorMessage());
+			URIImpl recordUri = RdfRecordUriFactory.createRecordUri(record);
+			URIImpl parentRecordUri = null;
+			String readerKey = null;
+			List recordInstanceStatements = RdfRecordUtil.getRecordInstanceStatements(record, createdTime,
+					recordUri, parentRecordUri, readerKey, filter);
+			/*
+			 * the first statement returned is a dataset has_part record triple
+			 * which we do not need
+			 */
+			recordInstanceStatements.remove(0);
+			stmts.add(new StatementImpl(fieldInstanceUri, IAO.DENOTES.uri(), recordUri));
+			stmts.addAll(recordInstanceStatements);
+		} else {
+			stmts.add(new StatementImpl(fieldInstanceUri, IAO.DENOTES.uri(), value));
+		}
+		return stmts;
 
 	}
 
diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java
index 451db76..2d6c6e5 100644
--- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java
+++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java
@@ -54,23 +54,17 @@
 import org.apache.log4j.Logger;
 import org.openrdf.model.Resource;
 import org.openrdf.model.Statement;
-import org.openrdf.model.Value;
 import org.openrdf.model.impl.URIImpl;
 import org.openrdf.rio.RDFHandlerException;
 import org.openrdf.rio.RDFWriter;
 
-import edu.ucdenver.ccp.common.collections.CollectionsUtil;
 import edu.ucdenver.ccp.common.file.CharacterEncoding;
 import edu.ucdenver.ccp.common.file.FileUtil;
-import edu.ucdenver.ccp.common.reflection.PrivateAccessor;
 import edu.ucdenver.ccp.common.string.StringConstants;
 import edu.ucdenver.ccp.datasource.fileparsers.DataRecord;
 import edu.ucdenver.ccp.datasource.fileparsers.RecordReader;
 import edu.ucdenver.ccp.datasource.fileparsers.RecordUtil;
-import edu.ucdenver.ccp.datasource.identifiers.DataSourceElement;
-import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier;
 import edu.ucdenver.ccp.datasource.identifiers.DataSource;
-import edu.ucdenver.ccp.datasource.rdfizer.rdf.RdfId;
 import edu.ucdenver.ccp.datasource.rdfizer.rdf.filter.DuplicateStatementFilter;
 import edu.ucdenver.ccp.datasource.rdfizer.rdf.filter.InMemoryDuplicateStatementFilter;
 import edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.RdfUtil.RdfFormat;
@@ -459,192 +453,192 @@ private void processRecord(DataRecord record, String readerKey, URIImpl recordUr
 		}
 	}
 
-	/**
-	 * Constant is assumed to be a valid URI String
-	 * 
-	 * @param tripleObj
-	 * @param 
-	 * @return
-	 */
-	private  Map, Collection> getConstantValues(String value) {
-		Map, Collection> type2valuesMap = new HashMap, Collection>();
-		Value constantValue = new URIImpl(value);
-		CollectionsUtil.addToOne2ManyMap(String.class, constantValue, type2valuesMap);
-		return type2valuesMap;
-	}
-
-	/**
-	 * 
-	 * @param 
-	 * @param record
-	 * @param tripleObj
-	 * @return
-	 */
-	private  Map, Collection> getLiteralValues(E record, String fieldName) {
-		Map, Collection> type2valuesMap = new HashMap, Collection>();
-		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
-		if (fieldValue == null)
-			return type2valuesMap;
-		if (fieldValue instanceof DataSourceElement) {
-			DataSourceElement element = (DataSourceElement) fieldValue;
-			Value literalValue = RdfUtil.createLiteral(element.getDataElement());
-			CollectionsUtil.addToOne2ManyMap(fieldValue.getClass(), literalValue, type2valuesMap);
-			return type2valuesMap;
-		}
-		if (fieldValue instanceof Collection) {
-			for (Object value : ((Collection) fieldValue))
-				if (value instanceof DataSourceElement) {
-					DataSourceElement element = (DataSourceElement) fieldValue;
-					Value literalValue = RdfUtil.createLiteral(element.getDataElement());
-					CollectionsUtil.addToOne2ManyMap(value.getClass(), literalValue, type2valuesMap);
-				} else
-					throw new RuntimeException(String.format("Unable to extract RDF object from field: %s. "
-							+ "Expected Collection but instead observed Collection<%s>.", fieldName,
-							value.getClass().getName()));
-			return type2valuesMap;
-		}
-		throw new RuntimeException(String.format("Unable to extract RDF object from field: %s (observedValue=%s)",
-				fieldName, fieldValue.toString()));
-	}
-
-	/**
-	 * Get values for triple definition where value is specified to use ICE formatting (ex:
-	 * {@code ensemblGeneId})
-	 * 
-	 * @param record
-	 * @param tripleObj
-	 * @return values
-	 */
-	private Map, Collection> getInformationContentEntityIDValues(DataRecord record, String fieldName) {
-		Map, Collection> type2valuesMap = new HashMap, Collection>();
-		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
-		if (fieldValue == null)
-			return type2valuesMap;
-
-		if (fieldValue instanceof DataSourceIdentifier) {
-			DataSourceIdentifier id = (DataSourceIdentifier) fieldValue;
-			RdfId rdfId = new RdfId(id);
-			Value iceIdValue = new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), rdfId.getICE_ID()).toString());
-			CollectionsUtil.addToOne2ManyMap(fieldValue.getClass(), iceIdValue, type2valuesMap);
-			return type2valuesMap;
-		}
-
-		if (fieldValue instanceof Collection) {
-			for (Object value : ((Collection) fieldValue))
-				if (value instanceof DataSourceElement) {
-					DataSourceIdentifier id = (DataSourceIdentifier) value;
-					RdfId rdfId = new RdfId(id);
-					Value iceIdValue = new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), rdfId.getICE_ID())
-							.toString());
-					CollectionsUtil.addToOne2ManyMap(value.getClass(), iceIdValue, type2valuesMap);
-				} else
-					throw new RuntimeException(String.format("Unable to extract RDF object from field: %s. "
-							+ "Expected Collection> but instead observed Collection<%s>.",
-							fieldName, value.getClass().getName()));
-			return type2valuesMap;
-		}
-
-		throw new RuntimeException(String.format("Unable to extract RDF object from field: %s (observedValue=%s)",
-				fieldName, fieldValue.toString()));
-	}
-
-	/**
-	 * Parser {@link DataRecord} from field of record.
-	 * 
-	 * @param 
-	 *            record type
-	 * @param record
-	 *            instance
-	 * @param fieldName
-	 *            field in record
-	 * @return record
-	 */
-	private  Map, Collection> getValues(E record, String fieldName) {
-		Map, Collection> type2valuesMap = new HashMap, Collection>();
-		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
-		if (fieldValue == null)
-			return type2valuesMap;
-
-		if (fieldValue instanceof DataSourceElement) {
-			DataSourceElement id = (DataSourceElement) fieldValue;
-			Value rdfValue = null;
-
-			if (id instanceof DataSourceIdentifier) {
-				RdfId rdfId = new RdfId((DataSourceIdentifier) id);
-				rdfValue = rdfId.getRdfValue();
-			} else
-				rdfValue = RdfUtil.createLiteral(id.getDataElement());
-
-			CollectionsUtil.addToOne2ManyMap(fieldValue.getClass(), rdfValue, type2valuesMap);
-			return type2valuesMap;
-		}
-
-		if (fieldValue instanceof Collection) {
-			for (Object value : ((Collection) fieldValue)) {
-				if (value instanceof DataSourceElement) {
-					DataSourceElement id = (DataSourceElement) value;
-					Value rdfValue = null;
-
-					if (id instanceof DataSourceIdentifier) {
-						RdfId rdfId = new RdfId((DataSourceIdentifier) id);
-						rdfValue = rdfId.getRdfValue();
-					} else
-						rdfValue = RdfUtil.createLiteral(id.getDataElement());
-
-					CollectionsUtil.addToOne2ManyMap(value.getClass(), rdfValue, type2valuesMap);
-				} else {
-					throw new RuntimeException(String.format("Unable to extract RDF object from field: %s. "
-							+ "Expected Collection but instead observed Collection<%s>.", fieldName,
-							value.getClass().getName()));
-				}
-			}
-
-			return type2valuesMap;
-		}
-
-		throw new RuntimeException(String.format("Unable to extract RDF object from field: %s (observedValue=%s)",
-				fieldName, fieldValue.toString()));
-	}
-
-	/**
-	 * Returns the subject Resource representation of the value of the field with the given name
-	 * contained in the input DataRecord. The field must be of type ResourceIdentifier.
-	 * 
-	 * @param record
-	 * @param fieldName
-	 * @return
-	 * 
-	 */
-	private Collection getSubjectResources(DataRecord record, String fieldName) {
-		Collection resources = new ArrayList();
-		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
-
-		if (fieldValue instanceof DataSourceIdentifier) {
-			DataSourceIdentifier id = (DataSourceIdentifier) fieldValue;
-			RdfId rdfId = new RdfId(id);
-			resources.add(new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), id.toString()).toString()));
-			return resources;
-		}
-
-		if (fieldValue instanceof Collection) {
-			for (Object resource : ((Collection) fieldValue))
-				if (resource instanceof DataSourceIdentifier) {
-					DataSourceIdentifier id = (DataSourceIdentifier) resource;
-					RdfId rdfId = new RdfId(id);
-					resources.add(new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), id.toString()).toString()));
-				} else {
-					String message = String.format("Unable to extract RDF subject from field: %s. "
-							+ "Expected Collection but instead observed Collection<%s>.",
-							fieldName, resource.getClass().getName());
-					throw new RuntimeException(message);
-				}
-
-			return resources;
-		}
-
-		throw new RuntimeException(String.format("Unable to extract RDF subject from field: %s (observedValue=%s)",
-				fieldName, fieldValue.toString()));
-	}
+//	/**
+//	 * Constant is assumed to be a valid URI String
+//	 * 
+//	 * @param tripleObj
+//	 * @param 
+//	 * @return
+//	 */
+//	private  Map, Collection> getConstantValues(String value) {
+//		Map, Collection> type2valuesMap = new HashMap, Collection>();
+//		Value constantValue = new URIImpl(value);
+//		CollectionsUtil.addToOne2ManyMap(String.class, constantValue, type2valuesMap);
+//		return type2valuesMap;
+//	}
+
+//	/**
+//	 * 
+//	 * @param 
+//	 * @param record
+//	 * @param tripleObj
+//	 * @return
+//	 */
+//	private  Map, Collection> getLiteralValues(E record, String fieldName) {
+//		Map, Collection> type2valuesMap = new HashMap, Collection>();
+//		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
+//		if (fieldValue == null)
+//			return type2valuesMap;
+//		if (fieldValue instanceof DataSourceElement) {
+//			DataSourceElement element = (DataSourceElement) fieldValue;
+//			Value literalValue = RdfUtil.createLiteral(element.getDataElement());
+//			CollectionsUtil.addToOne2ManyMap(fieldValue.getClass(), literalValue, type2valuesMap);
+//			return type2valuesMap;
+//		}
+//		if (fieldValue instanceof Collection) {
+//			for (Object value : ((Collection) fieldValue))
+//				if (value instanceof DataSourceElement) {
+//					DataSourceElement element = (DataSourceElement) value;
+//					Value literalValue = RdfUtil.createLiteral(element.getDataElement());
+//					CollectionsUtil.addToOne2ManyMap(value.getClass(), literalValue, type2valuesMap);
+//				} else
+//					throw new RuntimeException(String.format("Unable to extract RDF object from field: %s. "
+//							+ "Expected Collection but instead observed Collection<%s>.", fieldName,
+//							value.getClass().getName()));
+//			return type2valuesMap;
+//		}
+//		throw new RuntimeException(String.format("Unable to extract RDF object from field: %s (observedValue=%s)",
+//				fieldName, fieldValue.toString()));
+//	}
+
+//	/**
+//	 * Get values for triple definition where value is specified to use ICE formatting (ex:
+//	 * {@code ensemblGeneId})
+//	 * 
+//	 * @param record
+//	 * @param tripleObj
+//	 * @return values
+//	 */
+//	private Map, Collection> getInformationContentEntityIDValues(DataRecord record, String fieldName) {
+//		Map, Collection> type2valuesMap = new HashMap, Collection>();
+//		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
+//		if (fieldValue == null)
+//			return type2valuesMap;
+//
+//		if (fieldValue instanceof DataSourceIdentifier) {
+//			DataSourceIdentifier id = (DataSourceIdentifier) fieldValue;
+//			RdfId rdfId = new RdfId(id);
+//			Value iceIdValue = new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), rdfId.getICE_ID()).toString());
+//			CollectionsUtil.addToOne2ManyMap(fieldValue.getClass(), iceIdValue, type2valuesMap);
+//			return type2valuesMap;
+//		}
+//
+//		if (fieldValue instanceof Collection) {
+//			for (Object value : ((Collection) fieldValue))
+//				if (value instanceof DataSourceIdentifier) {
+//					DataSourceIdentifier id = (DataSourceIdentifier) value;
+//					RdfId rdfId = new RdfId(id);
+//					Value iceIdValue = new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), rdfId.getICE_ID())
+//							.toString());
+//					CollectionsUtil.addToOne2ManyMap(value.getClass(), iceIdValue, type2valuesMap);
+//				} else
+//					throw new RuntimeException(String.format("Unable to extract RDF object from field: %s. "
+//							+ "Expected Collection> but instead observed Collection<%s>.",
+//							fieldName, value.getClass().getName()));
+//			return type2valuesMap;
+//		}
+//
+//		throw new RuntimeException(String.format("Unable to extract RDF object from field: %s (observedValue=%s)",
+//				fieldName, fieldValue.toString()));
+//	}
+
+//	/**
+//	 * Extracts RDF values from the named field of the given {@link DataRecord}.
+//	 * 
+//	 * @param 
+//	 *            record type
+//	 * @param record
+//	 *            instance
+//	 * @param fieldName
+//	 *            field in record
+//	 * @return record
+//	 */
+//	private  Map, Collection> getValues(E record, String fieldName) {
+//		Map, Collection> type2valuesMap = new HashMap, Collection>();
+//		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
+//		if (fieldValue == null)
+//			return type2valuesMap;
+//
+//		if (fieldValue instanceof DataSourceElement) {
+//			DataSourceElement id = (DataSourceElement) fieldValue;
+//			Value rdfValue = null;
+//
+//			if (id instanceof DataSourceIdentifier) {
+//				RdfId rdfId = new RdfId((DataSourceIdentifier) id);
+//				rdfValue = rdfId.getRdfValue();
+//			} else
+//				rdfValue = RdfUtil.createLiteral(id.getDataElement());
+//
+//			CollectionsUtil.addToOne2ManyMap(fieldValue.getClass(), rdfValue, type2valuesMap);
+//			return type2valuesMap;
+//		}
+//
+//		if (fieldValue instanceof Collection) {
+//			for (Object value : ((Collection) fieldValue)) {
+//				if (value instanceof DataSourceElement) {
+//					DataSourceElement id = (DataSourceElement) value;
+//					Value rdfValue = null;
+//
+//					if (id instanceof DataSourceIdentifier) {
+//						RdfId rdfId = new RdfId((DataSourceIdentifier) id);
+//						rdfValue = rdfId.getRdfValue();
+//					} else
+//						rdfValue = RdfUtil.createLiteral(id.getDataElement());
+//
+//					CollectionsUtil.addToOne2ManyMap(value.getClass(), rdfValue, type2valuesMap);
+//				} else {
+//					throw new RuntimeException(String.format("Unable to extract RDF object from field: %s. "
+//							+ "Expected Collection but instead observed Collection<%s>.", fieldName,
+//							value.getClass().getName()));
+//				}
+//			}
+//
+//			return type2valuesMap;
+//		}
+//
+//		throw new RuntimeException(String.format("Unable to extract RDF object from field: %s (observedValue=%s)",
+//				fieldName, fieldValue.toString()));
+//	}
+//
+//	/**
+//	 * Returns the subject Resource representation of the value of the field with the given name
+//	 * contained in the input DataRecord. The field must be a DataSourceIdentifier or a Collection of them.
+//	 * 
+//	 * @param record
+//	 * @param fieldName
+//	 * @return
+//	 * 
+//	 */
+//	private Collection getSubjectResources(DataRecord record, String fieldName) {
+//		Collection resources = new ArrayList();
+//		Object fieldValue = PrivateAccessor.getFieldValue(record, fieldName);
+//
+//		if (fieldValue instanceof DataSourceIdentifier) {
+//			DataSourceIdentifier id = (DataSourceIdentifier) fieldValue;
+//			RdfId rdfId = new RdfId(id);
+//			resources.add(new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), id.toString()).toString()));
+//			return resources;
+//		}
+//
+//		if (fieldValue instanceof Collection) {
+//			for (Object resource : ((Collection) fieldValue))
+//				if (resource instanceof DataSourceIdentifier) {
+//					DataSourceIdentifier id = (DataSourceIdentifier) resource;
+//					RdfId rdfId = new RdfId(id);
+//					resources.add(new URIImpl(RdfUtil.createKiaoUri(rdfId.getNamespace(), id.toString()).toString()));
+//				} else {
+//					String message = String.format("Unable to extract RDF subject from field: %s. "
+//							+ "Expected Collection but instead observed Collection<%s>.",
+//							fieldName, resource.getClass().getName());
+//					throw new RuntimeException(message);
+//				}
+//
+//			return resources;
+//		}
+//
+//		throw new RuntimeException(String.format("Unable to extract RDF subject from field: %s (observedValue=%s)",
+//				fieldName, fieldValue.toString()));
+//	}
 
 	/**
 	 * Output RDF record to a file based on record's file key.
diff --git a/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplErroneousAndUnknownIdentifierTest.java b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplErroneousAndUnknownIdentifierTest.java
new file mode 100644
index 0000000..5edb920
--- /dev/null
+++ b/datasource-rdfizer/src/test/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImplErroneousAndUnknownIdentifierTest.java
@@ -0,0 +1,180 @@
+package edu.ucdenver.ccp.datasource.rdfizer.rdf.ice;
+
+/*
+ * #%L
+ * Colorado Computational Pharmacology's common module
+ * %%
+ * Copyright (C) 2012 - 2015 Regents of the University of Colorado
+ * %%
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ * 
+ * 1. Redistributions of source code must retain the above copyright notice, this
+ *    list of conditions and the following disclaimer.
+ * 
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ *    this list of conditions and the following disclaimer in the documentation
+ *    and/or other materials provided with the distribution.
+ * 
+ * 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software without
+ *    specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ * #L%
+ */
+
+import static org.junit.Assert.assertTrue;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.GregorianCalendar;
+import java.util.List;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import edu.ucdenver.ccp.common.collections.CollectionsUtil;
+import edu.ucdenver.ccp.common.file.CharacterEncoding;
+import edu.ucdenver.ccp.common.file.FileComparisonUtil;
+import edu.ucdenver.ccp.common.file.FileComparisonUtil.ColumnOrder;
+import edu.ucdenver.ccp.common.file.FileComparisonUtil.LineOrder;
+import edu.ucdenver.ccp.common.file.FileReaderUtil;
+import edu.ucdenver.ccp.common.file.FileUtil;
+import edu.ucdenver.ccp.common.file.FileWriterUtil;
+import edu.ucdenver.ccp.common.file.FileWriterUtil.FileSuffixEnforcement;
+import edu.ucdenver.ccp.common.file.FileWriterUtil.WriteMode;
+import edu.ucdenver.ccp.common.test.DefaultTestCase;
+import edu.ucdenver.ccp.datasource.fileparsers.pro.ProMappingFileParser;
+import edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.RdfUtil.RdfFormat;
+
+/**
+ * Testing using the protein ontology mapping file b/c it's a simple format and
+ * it has unknown and potentially erroneous data source identifiers.
+ */
+public class RdfRecordWriterImplErroneousAndUnknownIdentifierTest extends DefaultTestCase {
+
+	private File proMappingTxtFile_unknownIdentifier;
+	private File outputDirectory;
+	private final String expectedOutputFileName = "pr-ProMappingFileParser.0-0.nt";
+
+	@Before
+	public void setUp() throws Exception {
+		outputDirectory = folder.newFolder("output");
+		proMappingTxtFile_unknownIdentifier = folder.newFile("promapping.txt");
+		populateProMappingTxtFile_unknownIdentifier();
+	}
+
+	/**
+	 * PR:000000005 HGNC:11773 is_a 
+ * PR:000000005 UniProtKB_VAR:VAR_022359 is_a // unknown identifier type
+ * PR:000000006 UniProtKB:PABCDE exact // invalid UniProt ID
+ */ + private void populateProMappingTxtFile_unknownIdentifier() throws IOException { + List lines = CollectionsUtil.createList("PR:000000005\tHGNC:11773\tis_a", + "PR:000000005\tUniProtKB_VAR:VAR_022359\tis_a", "PR:000000006\tUniProtKB:PABCDE\texact"); + FileWriterUtil.printLines(lines, proMappingTxtFile_unknownIdentifier, CharacterEncoding.US_ASCII, + WriteMode.OVERWRITE, FileSuffixEnforcement.OFF); + } + + @Test + public void testWriteRdf_unknown_and_erroneous_identifiers() throws IOException { + ProMappingFileParser parser = new ProMappingFileParser(proMappingTxtFile_unknownIdentifier, + CharacterEncoding.US_ASCII); + RdfRecordWriterImpl recordWriter = new RdfRecordWriterImpl( + outputDirectory, RdfFormat.NTRIPLES); + long createdTimeInMillis20101217 = new GregorianCalendar(2010, 11, 17).getTimeInMillis(); + recordWriter.processRecordReader(parser, createdTimeInMillis20101217); + + File outputFile = FileUtil.appendPathElementsToDirectory(outputDirectory, expectedOutputFileName); + System.err.println("dir contents: " + Arrays.toString(outputDirectory.list())); + assertTrue("Output file should have been created.", outputFile.exists()); + + List linesFromFile = FileReaderUtil.loadLinesFromFile(outputFile, CharacterEncoding.UTF_8); + for (String l : linesFromFile) { + System.err.println(l); + } + + List expectedLines = getExpectedLines(); + assertTrue("N-Triple Lines should be as expected.", FileComparisonUtil.hasExpectedLines(outputFile, + CharacterEncoding.UTF_8, expectedLines, null, LineOrder.ANY_ORDER, ColumnOrder.AS_IN_FILE)); + } + + private List getExpectedLines() { + + return CollectionsUtil + .createList( + + " .", + " .", + " .", + " .", + " \"2010-12-17T00:00:00.000-07:00\"^^ .", + " .", + " .", + " .", + " .", + " .", + " .", + " \"is_a\"@en .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " 
\"UniProtKB_VAR:VAR_022359\"@en .", + " .", + " .", + " .", + " .", + " .", + " .", + " \"exact\"@en .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " .", + " \"Invalid UniProt ID: PABCDE. This ID does not comply with the specifications for UniProt accession numbers as defined here: http://www.uniprot.org/manual/accession_numbers\"@en .", + " .", + " .", + " .", + " \"UniProtKB:PABCDE\"@en ."); + } + +} From 35d9d009b90cc5357304377c54dab0b765813fd4 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 8 Feb 2016 18:02:04 -0700 Subject: [PATCH 14/36] Modified to accept a list of datasource names as input --- .../rdfizer/rdf/ice/IceRdfGenerator.java | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java index 988de27..40f668c 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java @@ -366,18 +366,23 @@ public enum RunBy { * gzipped
* args[4]: output record limit: can be used to produce a "light" * set of RDF. -1 to output all records, i.e. no limit
- *
+ * args[5]: list of comma-delimited taxonomy identifiers (from + * NCBI Taxonomy) that will be used to limit RDF generation where + * applicable, e.g. 9606 to convert only human-related database + * records to RDF
+ * * The remaining input arguments depend on args[0]:
* if NAME:
- * args[5]: name of the FileDataSource to process
- * args[6]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not + * args[6]: comma-delimited list of FileDataSource names to + * process
+ * args[7]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not * included or if "null" then the current date will be used
*
* if INDEX:
- * args[5]: start stage args
- * [6]: the number of stages to process
- * args[7]: the Split type: either BY_STAGES or NONE
- * if BY_STAGES, then the index in args[5] corresponds to a + * args[6]: start stage args
+ * [7]: the number of stages to process
+ * args[8]: the Split type: either BY_STAGES or NONE
+ * if BY_STAGES, then the index in args[6] corresponds to a * particular stage of a FileDataSource. Many of the * FileDataSources are processed in a single stage, however some * of the larger files are split into multiple stages to speed up @@ -387,7 +392,7 @@ public enum RunBy { * stage. This will result in longer execution times for the * larger files, however duplicate triple removal can be done * concurrently.
- * args[8]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not + * args[9]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not * included or if "null" then the current date will be used * */ @@ -436,10 +441,13 @@ public static void main(String[] args) { break; case NAME: - FileDataSource source = FileDataSource.valueOf(args[index++].toUpperCase()); - time = getTime(args, index); - generateIceRdf(source, time, baseSourceFileDirectory, baseRdfOutputDirectory, cleanSourceFiles, - compress, outputRecordLimit, taxonIds); + String datasourceStr = args[index++].toUpperCase(); + for (String ds : datasourceStr.split(",")) { + FileDataSource source = FileDataSource.valueOf(ds); + time = getTime(args, index); + generateIceRdf(source, time, baseSourceFileDirectory, baseRdfOutputDirectory, cleanSourceFiles, + compress, outputRecordLimit, taxonIds); + } break; default: throw new IllegalArgumentException("Unhandled RunBy option: " + runBy.name()); From 5e596946b3219aa38ec146a2495debef3986712f Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 8 Feb 2016 18:03:19 -0700 Subject: [PATCH 15/36] Added requiresManualDownload flag Also reformulated the output produced by main. It now prints a listing of available datasources indicating which require manual download. 
--- .../rdfizer/rdf/ice/FileDataSource.java | 451 ++++++------------ .../rdfizer/rdf/ice/FileDataSourceParams.java | 49 ++ 2 files changed, 195 insertions(+), 305 deletions(-) create mode 100644 datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSourceParams.java diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java index e50597d..b2158bf 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSource.java @@ -35,7 +35,9 @@ import java.io.File; import java.io.IOException; -import java.util.Arrays; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; import java.util.Set; import org.apache.log4j.Logger; @@ -43,7 +45,6 @@ import edu.ucdenver.ccp.common.file.CharacterEncoding; import edu.ucdenver.ccp.common.file.FileUtil; import edu.ucdenver.ccp.datasource.fileparsers.FileRecordReader; -import edu.ucdenver.ccp.datasource.fileparsers.dip.DipYYYYMMDDFileParser; import edu.ucdenver.ccp.datasource.fileparsers.drugbank.DrugbankXmlFileRecordReader; import edu.ucdenver.ccp.datasource.fileparsers.ebi.goa.GpAssociationGoaUniprotFileParser; import edu.ucdenver.ccp.datasource.fileparsers.ebi.interpro.InterPro2GoFileParser; @@ -84,11 +85,12 @@ import edu.ucdenver.ccp.datasource.fileparsers.rgd.RgdRatGeneNboAnnotationFileRecordReader; import edu.ucdenver.ccp.datasource.fileparsers.rgd.RgdRatGenePwAnnotationFileRecordReader; import edu.ucdenver.ccp.datasource.fileparsers.rgd.RgdRatGeneRdoAnnotationFileRecordReader; -import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader; import edu.ucdenver.ccp.datasource.fileparsers.transfac.TransfacGeneDatFileParser; import 
edu.ucdenver.ccp.datasource.fileparsers.transfac.TransfacMatrixDatFileParser; import edu.ucdenver.ccp.datasource.identifiers.DataSource; import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID; +import edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.FileDataSourceParams.IsTaxonAware; +import edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.FileDataSourceParams.RequiresManualDownload; /** * This enum separates RDF generation by data source file. It is intended to @@ -100,479 +102,328 @@ * ccpsupport@ucdenver.edu * */ + public enum FileDataSource { - /* - * DIP is now part of IRefWeb, so it has been commented out since it requires the extra manual step - * of logging in to the DIP website and downloading the file (and IRefWeb does not). - */ -// /** -// * The DIP data file must be obtained manually. It is assumed to already be -// * in place when RDF generation commences. It must be the only file in the -// * DIP data source directory. -// * -// */ -// DIP(DataSource.DIP) { -// -// @Override -// protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, -// File idListDir, Set taxonIds) throws IOException { -// logger.info("sourceFileDirectory (exists): (" + sourceFileDirectory.exists() + ")" + sourceFileDirectory); -// logger.info("file listing: " + Arrays.toString(sourceFileDirectory.listFiles())); -// File dipDataFile = sourceFileDirectory.listFiles()[0]; -// logger.info("File exists: " + dipDataFile.exists() + " -- " + dipDataFile.getAbsolutePath()); -// FileUtil.validateFile(dipDataFile); -// return new DipYYYYMMDDFileParser(dipDataFile, CharacterEncoding.US_ASCII, taxonIds); -// } -// -// @Override -// protected boolean isTaxonAware() { -// return true; -// } -// }, + * DIP is now part of IRefWeb, so it has been commented out since it + * requires the extra manual step of logging in to the DIP website and + * downloading the file (and IRefWeb does not). 
+ */ + // /** + // * The DIP data file must be obtained manually. It is assumed to already + // be + // * in place when RDF generation commences. It must be the only file in the + // * DIP data source directory. + // * + // */ + // DIP(DataSource.DIP) { + // + // @Override + // protected FileRecordReader initFileRecordReader(File + // sourceFileDirectory, boolean cleanSourceFiles, + // File idListDir, Set taxonIds) throws IOException { + // logger.info("sourceFileDirectory (exists): (" + + // sourceFileDirectory.exists() + ")" + sourceFileDirectory); + // logger.info("file listing: " + + // Arrays.toString(sourceFileDirectory.listFiles())); + // File dipDataFile = sourceFileDirectory.listFiles()[0]; + // logger.info("File exists: " + dipDataFile.exists() + " -- " + + // dipDataFile.getAbsolutePath()); + // FileUtil.validateFile(dipDataFile); + // return new DipYYYYMMDDFileParser(dipDataFile, CharacterEncoding.US_ASCII, + // taxonIds); + // } + // + // @Override + // protected boolean isTaxonAware() { + // return true; + // } + // }, /** - * + * */ - PHARMGKB_DISEASE(DataSource.PHARMGKB) { + PHARMGKB_DISEASE(DataSource.PHARMGKB, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new PharmGkbDiseaseFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - PHARMGKB_GENE(DataSource.PHARMGKB) { + PHARMGKB_GENE(DataSource.PHARMGKB, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new PharmGkbGeneFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - - - PHARMGKB_DRUG(DataSource.PHARMGKB) { + 
PHARMGKB_DRUG(DataSource.PHARMGKB, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new PharmGkbDrugFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * * */ - DRUGBANK(DataSource.DRUGBANK) { + DRUGBANK(DataSource.DRUGBANK, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new DrugbankXmlFileRecordReader(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - HGNC(DataSource.HGNC) { + HGNC(DataSource.HGNC, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new HgncDownloadFileParser(sourceFileDirectory, cleanSourceFiles, WithdrawnRecordTreatment.IGNORE); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - HOMOLOGENE(DataSource.HOMOLOGENE) { + HOMOLOGENE(DataSource.HOMOLOGENE, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new HomoloGeneDataFileParser(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, /** * */ - IREFWEB(DataSource.IREFWEB) { + IREFWEB(DataSource.IREFWEB, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws 
IOException { return new IRefWebPsiMitab2_6FileParser(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, /** * * */ - MGI_ENTREZGENE(DataSource.MGI) { + MGI_ENTREZGENE(DataSource.MGI, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MGIEntrezGeneFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - MGI_MGIPHENOGENO(DataSource.MGI) { + MGI_MGIPHENOGENO(DataSource.MGI, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MGIPhenoGenoMPFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - MGI_MRKLIST(DataSource.MGI) { + MGI_MRKLIST(DataSource.MGI, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MRKListFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - MGI_MRKREFERENCE(DataSource.MGI) { + MGI_MRKREFERENCE(DataSource.MGI, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MRKReferenceFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - MGI_MRKSEQUENCE(DataSource.MGI) { + MGI_MRKSEQUENCE(DataSource.MGI, IsTaxonAware.NO, 
RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MRKSequenceFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - MGI_MRKSWISSPROT(DataSource.MGI) { + MGI_MRKSWISSPROT(DataSource.MGI, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MRKSwissProtFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - MIRBASE(DataSource.MIRBASE) { + MIRBASE(DataSource.MIRBASE, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MirBaseMiRnaDatFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - + /** * */ - RGD_GENES(DataSource.RGD) { + RGD_GENES(DataSource.RGD, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new RgdRatGeneFileRecordReader(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - RGD_GENE_MP(DataSource.RGD) { + RGD_GENE_MP(DataSource.RGD, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new RgdRatGeneMpAnnotationFileRecordReader(sourceFileDirectory, cleanSourceFiles); } - - 
@Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - RGD_GENE_RDO(DataSource.RGD) { + RGD_GENE_RDO(DataSource.RGD, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new RgdRatGeneRdoAnnotationFileRecordReader(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - RGD_GENE_NBO(DataSource.RGD) { + RGD_GENE_NBO(DataSource.RGD, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new RgdRatGeneNboAnnotationFileRecordReader(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - RGD_GENE_PW(DataSource.RGD) { + RGD_GENE_PW(DataSource.RGD, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new RgdRatGenePwAnnotationFileRecordReader(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - PREMOD_HUMAN(DataSource.PREMOD) { + PREMOD_HUMAN(DataSource.PREMOD, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new HumanPReModModuleTabFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - PREMOD_MOUSE(DataSource.PREMOD) { + PREMOD_MOUSE(DataSource.PREMOD, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected 
FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new MousePReModModuleTabFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - PR_MAPPINGFILE(DataSource.PR) { + PR_MAPPINGFILE(DataSource.PR, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new ProMappingFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - REACTOME_UNIPROT2PATHWAYSTID(DataSource.REACTOME) { + REACTOME_UNIPROT2PATHWAYSTID(DataSource.REACTOME, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new ReactomeUniprot2PathwayStidTxtFileParser(sourceFileDirectory, cleanSourceFiles, idListDir, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, /** * */ - REFSEQ_RELEASECATALOG(DataSource.REFSEQ, 3) { + REFSEQ_RELEASECATALOG(DataSource.REFSEQ, 3, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new RefSeqReleaseCatalogFileParser(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, /** * */ - NCBIGENE_GENE2REFSEQ(DataSource.EG) { + NCBIGENE_GENE2REFSEQ(DataSource.EG, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws 
IOException { return new EntrezGene2RefseqFileParser(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, - NCBIGENE_GENEINFO(DataSource.EG) { + NCBIGENE_GENEINFO(DataSource.EG, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new EntrezGeneInfoFileParser(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } - }, - NCBIGENE_MIM2GENE(DataSource.EG) { + NCBIGENE_MIM2GENE(DataSource.EG, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new EntrezGeneMim2GeneFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - NCBIGENE_REFSEQUNIPROTCOLLAB(DataSource.EG) { + NCBIGENE_REFSEQUNIPROTCOLLAB(DataSource.EG, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new EntrezGeneRefSeqUniprotKbCollabFileParser(sourceFileDirectory, cleanSourceFiles, idListDir, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, /** */ - GOA(DataSource.GOA, 13) { + GOA(DataSource.GOA, 13, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new GpAssociationGoaUniprotFileParser(sourceFileDirectory, cleanSourceFiles, idListDir, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, 
/** */ - UNIPROT_SWISSPROT(DataSource.UNIPROT) { + UNIPROT_SWISSPROT(DataSource.UNIPROT, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new SwissProtXmlFileRecordReader(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, - UNIPROT_IDMAPPING(DataSource.UNIPROT, 3) { + UNIPROT_IDMAPPING(DataSource.UNIPROT, 3, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new UniProtIDMappingFileRecordReader(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, // UNIPROT_TREMBL(DataSource.UNIPROT, 33, 1000000) { // @Override @@ -584,65 +435,44 @@ protected boolean isTaxonAware() { // cleanSourceFiles, taxonIds); // } // }, - UNIPROT_TREMBL_SPARSE(DataSource.UNIPROT, 33, 1000000) { + UNIPROT_TREMBL_SPARSE(DataSource.UNIPROT, 33, 1000000, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new SparseTremblXmlFileRecordReader(sourceFileDirectory, cleanSourceFiles, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, - /** * */ - INTERPRO_NAMESDAT(DataSource.INTERPRO) { + INTERPRO_NAMESDAT(DataSource.INTERPRO, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new InterProNamesDatFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected 
boolean isTaxonAware() { - return false; - } }, - INTERPRO_INTERPRO2GO(DataSource.INTERPRO) { + INTERPRO_INTERPRO2GO(DataSource.INTERPRO, IsTaxonAware.NO, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new InterPro2GoFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - INTERPRO_PROTEIN2IPR(DataSource.INTERPRO, 13) { + INTERPRO_PROTEIN2IPR(DataSource.INTERPRO, 13, IsTaxonAware.YES, RequiresManualDownload.NO) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new InterProProtein2IprDatFileParser(sourceFileDirectory, cleanSourceFiles, idListDir, taxonIds); } - - @Override - protected boolean isTaxonAware() { - return true; - } }, - + /** * The HPRD HPRD_ID_MAPPINGS.txt file must be obtained manually. It is * assumed to already be in place when RDF generation commences. */ - HPRD_ID_MAPPINGS(DataSource.HPRD) { + HPRD_ID_MAPPINGS(DataSource.HPRD, IsTaxonAware.NO, RequiresManualDownload.YES) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, @@ -652,17 +482,12 @@ protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boo FileUtil.validateFile(hprdIdMappingFile); return new HprdIdMappingsTxtFileParser(hprdIdMappingFile, CharacterEncoding.US_ASCII); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * The TRANSFAC gene.dat and matrix.dat files must be obtained manually. * They are assumed to already be in place when RDF generation commences. 
*/ - TRANSFAC_GENE(DataSource.TRANSFAC) { + TRANSFAC_GENE(DataSource.TRANSFAC, IsTaxonAware.NO, RequiresManualDownload.YES) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { @@ -670,14 +495,9 @@ protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boo FileUtil.validateFile(transfacGeneDatFile); return new TransfacGeneDatFileParser(transfacGeneDatFile, CharacterEncoding.ISO_8859_1); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - TRANSFAC_MATRIX(DataSource.TRANSFAC) { + TRANSFAC_MATRIX(DataSource.TRANSFAC, IsTaxonAware.NO, RequiresManualDownload.YES) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { @@ -685,17 +505,12 @@ protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boo FileUtil.validateFile(transfacMatrixDatFile); return new TransfacMatrixDatFileParser(transfacMatrixDatFile, CharacterEncoding.ISO_8859_1); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * The GAD all.txt data file must be obtained manually. It is assumed to * already be in place when RDF generation commences. 
*/ - GAD(DataSource.GAD) { + GAD(DataSource.GAD, IsTaxonAware.NO, RequiresManualDownload.YES) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { @@ -704,41 +519,25 @@ protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boo FileUtil.validateFile(gadAllTxtFile); return new GeneticAssociationDbAllTxtFileParser(gadAllTxtFile, CharacterEncoding.US_ASCII); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, /** * */ - OMIM(DataSource.OMIM) { + OMIM(DataSource.OMIM, IsTaxonAware.NO, RequiresManualDownload.YES) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { return new OmimTxtFileParser(sourceFileDirectory, cleanSourceFiles); } - - @Override - protected boolean isTaxonAware() { - return false; - } }, - PHARMGKB_RELATION(DataSource.PHARMGKB) { + PHARMGKB_RELATION(DataSource.PHARMGKB, IsTaxonAware.NO, RequiresManualDownload.YES) { @Override protected FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListDir, Set taxonIds) throws IOException { File pharmgkbRelationshipsDataFile = new File(sourceFileDirectory, "relationships.tsv"); return new PharmGkbRelationFileParser(pharmgkbRelationshipsDataFile, CharacterEncoding.UTF_8); } - - @Override - protected boolean isTaxonAware() { - return false; - } - } - ; + }; public enum Split { BY_STAGES, NONE; @@ -774,22 +573,35 @@ public enum Split { private final Long blockRecordCount; - private FileDataSource(DataSource dataSource, int numberOfStages, long blockRecordCount) { + private final IsTaxonAware isTaxonAware; + + private final RequiresManualDownload requiresManualDownload; + + private FileDataSource(DataSource dataSource, int numberOfStages, long blockRecordCount, IsTaxonAware isTaxonAware, + RequiresManualDownload 
requiresManualDownload) { this.dataSource = dataSource; this.numberOfStages = numberOfStages; this.blockRecordCount = blockRecordCount; + this.isTaxonAware = isTaxonAware; + this.requiresManualDownload = requiresManualDownload; } - private FileDataSource(DataSource dataSource, int numberOfStages) { + private FileDataSource(DataSource dataSource, int numberOfStages, IsTaxonAware isTaxonAware, + RequiresManualDownload requiresManualDownload) { this.dataSource = dataSource; this.numberOfStages = numberOfStages; this.blockRecordCount = null; + this.isTaxonAware = isTaxonAware; + this.requiresManualDownload = requiresManualDownload; } - private FileDataSource(DataSource dataSource) { + private FileDataSource(DataSource dataSource, IsTaxonAware isTaxonAware, + RequiresManualDownload requiresManualDownload) { this.dataSource = dataSource; this.numberOfStages = 1; this.blockRecordCount = null; + this.isTaxonAware = isTaxonAware; + this.requiresManualDownload = requiresManualDownload; } public DataSource dataSource() { @@ -800,6 +612,14 @@ public Long blockRecordCount() { return blockRecordCount; } + public boolean isTaxonAware() { + return isTaxonAware == IsTaxonAware.YES; + } + + public boolean requiresManualDownload() { + return requiresManualDownload == RequiresManualDownload.YES; + } + // /** // * @param stageNum // * @param baseSourceFileDirectory @@ -862,8 +682,6 @@ public Long blockRecordCount() { protected abstract FileRecordReader initFileRecordReader(File sourceFileDirectory, boolean cleanSourceFiles, File idListFileDirectory, Set taxonIds) throws IOException; - protected abstract boolean isTaxonAware(); - // /** // * To be implemented by each DataSourceRdfGenerator instance. 
// * @@ -986,23 +804,46 @@ public int getNumberOfStages() { * @param args */ public static void main(String[] args) { - int stageCount = 0; - System.out.println("BY STAGES: "); - for (FileDataSource source : FileDataSource.values()) { - for (int i = 0; i < source.getNumberOfStages(); i++) { - System.out.println("Global Stage: " + (i + 1 + stageCount) + " ==> " + source.name() + " Local Stage: " - + (i + 1)); + // int stageCount = 0; + // System.out.println("BY STAGES: "); + // for (FileDataSource source : FileDataSource.values()) { + // for (int i = 0; i < source.getNumberOfStages(); i++) { + // System.out.println("Global Stage: " + (i + 1 + stageCount) + " ==> " + // + source.name() + " Local Stage: " + // + (i + 1)); + // } + // stageCount += source.getNumberOfStages(); + // } + // System.out.println("Total # of stages: " + stageCount + "\n\n"); + // stageCount = 0; + // System.out.println("SINGLE STAGE PER SOURCE:"); + // for (FileDataSource source : FileDataSource.values()) { + // System.out.println("SGE index: " + (stageCount + 1) + " ==> " + + // source.name()); + // stageCount++; + // } + + List autoDownloadSources = new ArrayList(); + List manualDownloadSources = new ArrayList(); + + for (FileDataSource fds : values()) { + if (fds.requiresManualDownload()) { + manualDownloadSources.add(fds.name()); + } else { + autoDownloadSources.add(fds.name()); } - stageCount += source.getNumberOfStages(); } - System.out.println("Total # of stages: " + stageCount + "\n\n"); - stageCount = 0; - System.out.println("SINGLE STAGE PER SOURCE:"); - for (FileDataSource source : FileDataSource.values()) { - System.out.println("SGE index: " + (stageCount + 1) + " ==> " + source.name()); - stageCount++; + + Collections.sort(autoDownloadSources); + Collections.sort(manualDownloadSources); + + for (String name : autoDownloadSources) { + System.out.println("DS: " + name); + } + System.out.println("DS: ==== BELOW REQUIRE MANUAL DOWNLOAD OF DATA SOURCE FILE ===="); + for (String name : 
manualDownloadSources) { + System.out.println("DS: ==== " + name); } - } } diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSourceParams.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSourceParams.java new file mode 100644 index 0000000..283070f --- /dev/null +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/FileDataSourceParams.java @@ -0,0 +1,49 @@ +package edu.ucdenver.ccp.datasource.rdfizer.rdf.ice; + +/* + * #%L + * Colorado Computational Pharmacology's datasource + * project + * %% + * Copyright (C) 2012 - 2016 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * #L% + */ + +public class FileDataSourceParams { + + public static enum RequiresManualDownload { + YES, + NO + } + + public static enum IsTaxonAware { + YES, + NO + } + +} From ca9d9ddf2dde74dc6a9de193187cdbaaf73f5b1e Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 8 Feb 2016 18:04:04 -0700 Subject: [PATCH 16/36] scripts reformulated to work with datasource names dependence on the integer mapping to datasources has been removed --- ...wnload-datasources-and-generate-triples.sh | 23 ++++++++++--------- ...le-indices.sh => list-datasource-names.sh} | 7 +++--- .../scripts/pom-rdf-gen-9606.xml | 6 ++--- .../scripts/pom-rdf-gen-modelorgs.xml | 6 ++--- datasource-rdfizer/scripts/pom-rdf-gen.xml | 10 +++----- ....xml => pom-rdf-list-datasource-names.xml} | 0 6 files changed, 23 insertions(+), 29 deletions(-) rename datasource-rdfizer/scripts/{list-download-file-indices.sh => list-datasource-names.sh} (76%) rename datasource-rdfizer/scripts/{pom-rdf-gen-ids.xml => pom-rdf-list-datasource-names.xml} (100%) diff --git a/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh b/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh index 260a5ca..70f5035 100755 --- a/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh +++ b/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh @@ -9,8 +9,8 @@ function print_usage { echo "$(basename $0) [OPTIONS]" echo " <-d >: 
The directory into which to place the downloaded datasource files." echo " <-r >: The directory into which to place the RDF triples parsed from the datasource files." - echo " [-i ]: The indices of the datasources to download; if not specified, all available datasources will be downloaded." - echo " [-t ]: The names of the datasources to download; if not specified, all available datasources will be downloaded." + echo " [-t -classpath edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.IceRdfGenerator - INDEX + NAME ${baseSourceDir} ${baseRdfDir} ${compressRdf} ${outputRecordLimit} 9606 - ${startStage} - ${numStages} - NONE + ${datasourceNames} ${date} diff --git a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml index bd4c1bf..13acd49 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml @@ -35,15 +35,13 @@ -classpath edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.IceRdfGenerator - INDEX + NAME ${baseSourceDir} ${baseRdfDir} ${compressRdf} ${outputRecordLimit} 9606,741158,63221,10090,947985,80274,57486,477816,477815,46456,35531,179238,1266728,116058,10092,10091,39442,10116,947987,7227,4932,947046,947045,947044,947043,947042,947041,947040,947039,947038,947037,947036,947035,929629,929587,929586,929585,927258,927256,889517,765312,764102,764101,764100,764099,764098,764097,721032,717647,658763,643680,614665,614664,580240,580239,574961,545124,538976,538975,502869,471861,471859,471510,468558,466209,464025,462210,462209,41870,307796,285006,1247190,1227742,1220494,1218710,1216859,1216345,1204498,1201112,1196866,1182968,1182967,1182966,1177187,1162674,1162673,1162672,1162671,1158205,1158204,1149757,1144731,1138861,1097555,1095001,1087981,559292,6239,7955,3702 - ${startStage} - ${numStages} - NONE + ${datasourceNames} ${date} diff --git a/datasource-rdfizer/scripts/pom-rdf-gen.xml b/datasource-rdfizer/scripts/pom-rdf-gen.xml index 0ab015d..c9f880b 100644 --- 
a/datasource-rdfizer/scripts/pom-rdf-gen.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen.xml @@ -35,7 +35,7 @@ -classpath edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.IceRdfGenerator - INDEX + NAME ${baseSourceDir} ${taxonIDs} - - ${startStage} - - ${numStages} - - NONE + + ${datasourceNames} ${date} diff --git a/datasource-rdfizer/scripts/pom-rdf-gen-ids.xml b/datasource-rdfizer/scripts/pom-rdf-list-datasource-names.xml similarity index 100% rename from datasource-rdfizer/scripts/pom-rdf-gen-ids.xml rename to datasource-rdfizer/scripts/pom-rdf-list-datasource-names.xml From 10962afaaf097bcfd5346ae14590314a939cca00 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 8 Feb 2016 21:30:50 -0700 Subject: [PATCH 17/36] updated expected file header ID --> IDs --- .../ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java index 56e2ac3..2ab6dbe 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java @@ -66,8 +66,7 @@ * */ public class MRKSequenceFileParser extends SingleLineFileRecordReader { - - private static final String HEADER = "MGI Marker Accession ID\tMarker Symbol\tStatus\tMarker Type\tMarker Name\tcM position\tChromosome\tGenome Coordinate Start\tGenome Coordinate End\tStrand\tGenBank ID\tRefSeq transcript ID\tVEGA transcript ID\tEnsembl transcript ID\tUniProt ID\tTrEMBL ID\tVEGA protein ID\tEnsembl protein ID\tRefSeq protein ID\tUniGene ID"; + private static final String HEADER = "MGI Marker Accession ID\tMarker Symbol\tStatus\tMarker Type\tMarker Name\tcM position\tChromosome\tGenome Coordinate 
Start\tGenome Coordinate End\tStrand\tGenBank IDs\tRefSeq transcript IDs\tVEGA transcript IDs\tEnsembl transcript IDs\tUniProt IDs\tTrEMBL IDs\tVEGA protein IDs\tEnsembl protein IDs\tRefSeq protein IDs\tUniGene IDs"; private static final Logger logger = Logger.getLogger(MRKSequenceFileParser.class); From 8388b8586dfa23e0227c5f044479aa1c57950bce Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Mon, 8 Feb 2016 21:36:41 -0700 Subject: [PATCH 18/36] Added new field: comment --- .../ncbi/gene/EntrezGeneMim2GeneFileData.java | 13 +++++++++---- .../ncbi/gene/EntrezGeneMim2GeneFileParser.java | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileData.java index bec2aef..c531dd8 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileData.java @@ -79,19 +79,23 @@ public class EntrezGeneMim2GeneFileData extends SingleLineFileRecord { @RecordField(comment = "The accession assigned by MedGen to this phenotype. If the accession starts with a C followed by integers, the identifier is a concept ID (CUI) from UMLS. 
http://www.nlm.nih.gov/research/umls/ If it starts with a CN, no CUI in UMLS was identified, and NCBI created a placeholder.") private final MedGenId medGenId; + @RecordField + private final String comment; + public EntrezGeneMim2GeneFileData(OmimID mimNumber, EntrezGeneID entrezGeneID, String associationType, - Set sources, MedGenId medGenId, long byteOffset, long lineNumber) { + Set sources, MedGenId medGenId, String comment, long byteOffset, long lineNumber) { super(byteOffset, lineNumber); this.mimNumber = mimNumber; this.entrezGeneID = entrezGeneID; this.associationType = associationType; this.sources = sources; this.medGenId = medGenId; + this.comment = comment; } public static EntrezGeneMim2GeneFileData parseMim2GeneLine(Line line) { - String[] toks = line.getText().split("\\t"); - if (toks.length == 5) { + String[] toks = line.getText().split("\\t", -1); + if (toks.length == 6) { OmimID mimNumber = new OmimID(toks[0]); EntrezGeneID entrezGeneID = (toks[1].equals("-")) ? null : new EntrezGeneID(toks[1]); String associationType = toks[2]; @@ -103,7 +107,8 @@ public static EntrezGeneMim2GeneFileData parseMim2GeneLine(Line line) { } } MedGenId medGenId = (toks[4].equals("-")) ? null : new MedGenId(toks[4].trim()); - return new EntrezGeneMim2GeneFileData(mimNumber, entrezGeneID, associationType, sources, medGenId, + String comment = (toks[5].equals("-")) ? 
null : toks[5].trim(); + return new EntrezGeneMim2GeneFileData(mimNumber, entrezGeneID, associationType, sources, medGenId, comment, line.getByteOffset(), line.getLineNumber()); } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileParser.java index 64ed4ac..7042ccc 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileParser.java @@ -52,7 +52,7 @@ */ public class EntrezGeneMim2GeneFileParser extends SingleLineFileRecordReader { - private static final String HEADER = "#MIM number\tGeneID\ttype\tSource\tMedGenCUI"; + private static final String HEADER = "#MIM number\tGeneID\ttype\tSource\tMedGenCUI\tComment"; public static final String FTP_FILE_NAME = "mim2gene_medgen"; public static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; From c139c7875e398072fdbe9e27adbd76654f270219 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 10:00:51 -0700 Subject: [PATCH 19/36] Updated allowed accession prefixes --- .../NucleotideAccessionResolver.java | 49 +++++++++++-------- .../identifiers/ProteinAccessionResolver.java | 18 ++++++- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java index f192f76..52e4ec5 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java +++ 
b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java @@ -66,23 +66,26 @@ public class NucleotideAccessionResolver { private static final Pattern ACC_PATTERN = Pattern.compile("([A-Z]+)\\d+\\.?\\d*"); - private static final Set GENBANK_ID_PREFIXES = CollectionsUtil.createSet("CH", "CM", "DS", "EM", "EN", - "EP", "EQ", "FA", "GG", "GL", "JH", "KB", "H", "N", "T", "R", "W", "AA", "AI", "AW", "BE", "BF", "BG", - "BI", "BM", "BQ", "BU", "CA", "CB", "CD", "CF", "CK", "CN", "CO", "CV", "CX", "DN", "DR", "DT", "DV", "DW", - "DY", "EB", "EC", "EE", "EG", "EH", "EL", "ES", "EV", "EW", "EX", "EY", "FC", "FD", "FE", "FF", "FG", "FK", - "FL", "GD", "GE", "GH", "GO", "GR", "GT", "GW", "HO", "HS", "JG", "JK", "JZ", "U", "AF", "AY", "DQ", "EF", - "EU", "FJ", "GQ", "GU", "HM", "HQ", "JF", "JN", "JQ", "JX", "KC", "AE", "CP", "CY", "B", "AQ", "AZ", "BH", - "BZ", "CC", "CE", "CG", "CL", "CW", "CZ", "DU", "DX", "ED", "EI", "EJ", "EK", "ER", "ET", "FH", "FI", "GS", - "HN", "HR", "JJ", "JM", "JS", "JY", "AC", "DP", "I", "AR", "DZ", "EA", "GC", "GP", "GV", "GX", "GY", "GZ", - "HJ", "HK", "HL", "G", "BV", "GF", "BK", "BL", "GJ", "GK", "EZ", "HP", "JI", "JL", "JO", "JP", "JR", "JT", - "JU", "JV", "JW", "KA", "S", "AD", "AH", "AS", "BC", "BT", "J", "K", "L", "M", "N"); + private static final Set GENBANK_ID_PREFIXES = CollectionsUtil.createSet("H", "N", "T", "R", "W", "AA", + "AI", "AW", "BE", "BF", "BG", "BI", "BM", "BQ", "BU", "CA", "CB", "CD", "CF", "CK", "CN", "CO", "CV", "CX", + "DN", "DR", "DT", "DV", "DW", "DY", "EB", "EC", "EE", "EG", "EH", "EL", "ES", "EV", "EW", "EX", "EY", "FC", + "FD", "FE", "FF", "FG", "FK", "FL", "GD", "GE", "GH", "GO", "GR", "GT", "GW", "HO", "HS", "JG", "JK", "JZ", + "U", "AF", "AY", "DQ", "EF", "EU", "FJ", "GQ", "GU", "HM", "HQ", "JF", "JN", "JQ", "JX", "KC", "KF", "KJ", + "KM", "KP", "KR", "KT", "KU", "AE", "CP", "CY", "B", "AQ", "AZ", "BH", "BZ", "CC", "CE", "CG", "CL", "CW", + "CZ", "DU", "DX", 
"ED", "EI", "EJ", "EK", "ER", "ET", "FH", "FI", "GS", "HN", "HR", "JJ", "JM", "JS", "JY", + "KG", "KO", "KS", "AC", "DP", "I", "AR", "DZ", "EA", "GC", "GP", "GV", "GX", "GY", "GZ", "HJ", "HK", "HL", + "G", "BV", "GF", "BK", "BL", "GJ", "GK", "EZ", "HP", "JI", "JL", "JO", "JP", "JR", "JT", "JU", "JV", "JW", + "KA", "S", "AD", "AH", "AS", "BC", "BT", "J", "K", "L", "M", "N", "CH", "CM", "DS", "EM", "EN", "EP", "EQ", + "FA", "GG", "GL", "JH", "KB", "KD", "KE", "KI", "KK", "KL", "KN", "KQ", "KV"); private static final Set EMBL_ID_PREFIXES = CollectionsUtil.createSet("AN", "F", "V", "X", "Y", "Z", "AJ", - "AM", "FM", "FN", "HE", "HF", "HG", "FO", "AL", "BX", "CR", "CT", "CU", "FP", "FQ", "FR", "A", "AX", "CQ", - "CS", "FB", "GM", "GN", "HA", "HB", "HC", "HD", "HH", "HI", "JA", "JB", "JC", "JD", "JE", "BN"); - private static final Set DDBJ_ID_PREFIXES = CollectionsUtil.createSet("BA", "DF", "DG", "C", "AT", "AU", - "AV", "BB", "BJ", "BP", "BW", "BY", "CI", "CJ", "DA", "DB", "DC", "DK", "FS", "FY", "HX", "HY", "D", "AB", - "AP", "BS", "AG", "DE", "DH", "FT", "GA", "AK", "E", "BD", "DD", "DI", "DJ", "DL", "DM", "FU", "FV", "FW", - "FZ", "GB", "HV", "HW", "BR", "HT", "HU", "FX"); + "AM", "FM", "FN", "HE", "HF", "HG", "FO", "LK", "LL", "LM", "LN", "LO", "LP", "LQ", "LR", "LS", "LT", "AL", + "BX", "CR", "CT", "CU", "FP", "FQ", "FR", "A", "AX", "CQ", "CS", "FB", "GM", "GN", "HA", "HB", "HC", "HD", + "HH", "HI", "JA", "JB", "JC", "JD", "JE", "BN"); + private static final Set DDBJ_ID_PREFIXES = CollectionsUtil.createSet("BA", "DF", "DG", "LD", "C", "AT", + "AU", "AV", "BB", "BJ", "BP", "BW", "BY", "CI", "CJ", "DA", "DB", "DC", "DK", "FS", "FY", "HX", "HY", "D", + "AB", "LC", "AP", "BS", "AG", "DE", "DH", "FT", "GA", "LB", "AK", "E", "BD", "DD", "DI", "DJ", "DL", "DM", + "FU", "FV", "FW", "FZ", "GB", "HV", "HW", "HZ", "LF", "LG", "BR", "HT", "HU", "FX", "LA", "LE", "LH", "LI", + "LJ"); private static Map>> prefixToIdClass; @@ -113,13 +116,19 @@ public static DataSourceIdentifier 
resolveNucleotideAccession(String acc if (prefix.length() == 5 && prefix.startsWith("A")) { return new DdbjId(acc); } - if (prefix.length() == 4 && (prefix.startsWith("A") || prefix.startsWith("D") || prefix.startsWith("G"))) { + if (prefix.length() == 4 + && (prefix.startsWith("A") || prefix.startsWith("D") || prefix.startsWith("G") + || prefix.startsWith("J") || prefix.startsWith("L") || prefix.startsWith("M") + || prefix.startsWith("N") || prefix.startsWith("K"))) { return new GenBankID(acc); } - if (prefix.length() == 4 && (prefix.startsWith("B") || prefix.startsWith("E"))) { + if (prefix.length() == 4 + && (prefix.startsWith("B") || prefix.startsWith("P") || prefix.startsWith("E") || prefix + .startsWith("I"))) { return new DdbjId(acc); } - if (prefix.length() == 4 && prefix.startsWith("C")) { + if (prefix.length() == 4 && prefix.startsWith("C") || prefix.startsWith("F") || prefix.startsWith("O") + || prefix.startsWith("H")) { return new EmblID(acc); } Class> idClass = prefixToIdClass.get(prefix); @@ -141,7 +150,7 @@ public static DataSourceIdentifier resolveNucleotideAccession(String acc } } } - logger.warn("Input is not a known nucleotide accession: " + acc); + // logger.warn("Input is not a known nucleotide accession: " + acc); return new ProbableErrorDataSourceIdentifier(acc, null, "Input is not a known nucleotide accession: " + acc); } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java index 3827dc0..0efbf90 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java @@ -56,11 +56,12 @@ * */ public class ProteinAccessionResolver { - + private static final Logger logger = 
Logger.getLogger(ProteinAccessionResolver.class); private static final Pattern ACC_PATTERN = Pattern.compile("([A-Z]{3})\\d+\\.?\\d*"); private static final String VALID_UNIPROT_PATTERN_1 = "[A-NR-Z][0-9][A-Z][A-Z0-9][A-Z0-9][0-9]"; + private static final String VALID_UNIPROT_PATTERN_3 = "[A-NR-Z][0-9][A-Z][A-Z0-9][A-Z0-9][0-9][A-Z][A-Z0-9][A-Z0-9][0-9]"; private static final String VALID_UNIPROT_PATTERN_2 = "[OPQ][0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9]"; public static DataSourceIdentifier resolveProteinAccession(String acc) { @@ -68,7 +69,8 @@ public static DataSourceIdentifier resolveProteinAccession(String acc) { if (acc.matches("[A-Z][A-Z]_\\d+\\.?\\d*")) { return new RefSeqID(acc); } - if (acc.matches(VALID_UNIPROT_PATTERN_1) || acc.matches(VALID_UNIPROT_PATTERN_2)) { + if (acc.matches(VALID_UNIPROT_PATTERN_1) || acc.matches(VALID_UNIPROT_PATTERN_2) + || acc.matches(VALID_UNIPROT_PATTERN_3)) { return new UniProtID(acc); } Matcher m = ACC_PATTERN.matcher(acc); @@ -104,6 +106,18 @@ public static DataSourceIdentifier resolveProteinAccession(String acc) { if (prefix.startsWith("J")) { return new GenBankID(acc); } + if (prefix.startsWith("K")) { + return new GenBankID(acc); + } + if (prefix.startsWith("L")) { + return new DdbjId(acc); + } + if (prefix.startsWith("M")) { + return new GenBankID(acc); + } + if (prefix.startsWith("N")) { + return new GenBankID(acc); + } } logger.warn("Input is not a known protein accession pattern: " + acc); return new ProbableErrorDataSourceIdentifier(acc, null, "Input is not a known protein accession pattern: " From b98deaa6448e01f6172487c7cb7b22edd11e30a8 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 10:01:28 -0700 Subject: [PATCH 20/36] Removed check made invalid by unknown identifier handling --- .../rdf/filter/DuplicateFieldValueFilter.java | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git 
a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/filter/DuplicateFieldValueFilter.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/filter/DuplicateFieldValueFilter.java index 9100772..314f2b4 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/filter/DuplicateFieldValueFilter.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/filter/DuplicateFieldValueFilter.java @@ -66,7 +66,7 @@ public abstract class DuplicateFieldValueFilter implements DuplicateStatementFil private final DiskBasedHash hash; private List noDupsFiles; private String previousFieldValueKey = null; - private int _3inARowCount = 0; +// private int _3inARowCount = 0; public DuplicateFieldValueFilter(DiskBasedHash hash) throws IOException { this.hash = hash; @@ -84,10 +84,11 @@ public boolean alreadyObservedStatement(Statement stmt) { if (isFieldRdfLine(subject)) { String fieldValueKey = getFieldValueKey(subject); if (!fieldValueKey.equals(previousFieldValueKey)) { - if (previousFieldValueKey != null && ((_3inARowCount % 3) != 0)) { - throw new IllegalStateException("3-in-a-row-count not equal to 3 (" + _3inARowCount + "): " - + previousFieldValueKey); - } + /* Handling of unknown and probable error identifiers seems to break the 3 in a row count*/ +// if (previousFieldValueKey != null && ((_3inARowCount % 3) != 0)) { +// throw new IllegalStateException("3-in-a-row-count not equal to 3 (" + _3inARowCount + "): " +// + previousFieldValueKey); +// } if (previousFieldValueKey != null) { try { hash.add(previousFieldValueKey); @@ -96,9 +97,9 @@ public boolean alreadyObservedStatement(Statement stmt) { } } previousFieldValueKey = fieldValueKey; - _3inARowCount = 1; +// _3inARowCount = 1; } else { - _3inARowCount++; +// _3inARowCount++; } if (!hash.contains(fieldValueKey)) { return false; From 499f6c5be644890715c2b89f4477fd2bfd983124 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: 
Tue, 9 Feb 2016 10:01:57 -0700 Subject: [PATCH 21/36] files are now closed properly when an exception halts processing --- .../rdfizer/rdf/ice/RdfRecordWriterImpl.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java index 2d6c6e5..9ad8a56 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/RdfRecordWriterImpl.java @@ -654,12 +654,22 @@ private void write(Statement stmt, DataSource ns) { // them in the filter. This saves some memory and also the time needed to check for // something that is guaranteed to not be already observed boolean checkFilter = needToCheckFilter(stmt.getSubject()); + try { if (!checkFilter || (checkFilter && !filter.alreadyObservedStatement(stmt))) { if (!rollingCacheContains(stmt)) { write(stmt, rdfWriter); writtenStatementCount++; } } + } catch(IllegalStateException e) { + logger.error("Halting RDF Generation due to IllegalStateException.", e); + try { + closeFiles(); + System.exit(-1); + } catch (IOException e1) { + e1.printStackTrace(); + } + } } /** From ee2a4abcc742a77ecb5a2555ccfedeca86c17a26 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 10:02:26 -0700 Subject: [PATCH 22/36] fixes to comply with added column (feature type) --- .../fileparsers/mgi/MRKSequenceFileData.java | 6 +++++- .../fileparsers/mgi/MRKSequenceFileParser.java | 16 ++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileData.java index 8290529..45a3e60 100755 --- 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileData.java @@ -98,6 +98,8 @@ public class MRKSequenceFileData extends SingleLineFileRecord { private final Set refseqProteinIds; @RecordField private final Set unigeneIds; + @RecordField + private final String featureType; /** @@ -122,6 +124,7 @@ public class MRKSequenceFileData extends SingleLineFileRecord { * @param vegaProteinIds * @param ensemblProteinIds * @param refseqProteinIds + * @param featureType */ public MRKSequenceFileData(MgiGeneID mgiAccessionID, String markerSymbol, String status, MgiGeneType markerType, String markerName, String cM_Position, String chromosome, @@ -129,7 +132,7 @@ public MRKSequenceFileData(MgiGeneID mgiAccessionID, String markerSymbol, String Set> genBankAccessionIDs, Set refseqTranscriptIds, Set vegaTranscriptIds, Set ensemblTranscriptId, Set uniprotIds, Set tremblIds, Set vegaProteinIds, Set ensemblProteinIds, - Set refseqProteinIds, Set unigeneIds, long byteOffset, long lineNumber) { + Set refseqProteinIds, Set unigeneIds, String featureType, long byteOffset, long lineNumber) { super(byteOffset, lineNumber); this.mgiAccessionID = mgiAccessionID; this.markerSymbol = markerSymbol; @@ -151,6 +154,7 @@ public MRKSequenceFileData(MgiGeneID mgiAccessionID, String markerSymbol, String this.ensemblProteinIds = ensemblProteinIds; this.refseqProteinIds = refseqProteinIds; this.unigeneIds = unigeneIds; + this.featureType = featureType; } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java index 2ab6dbe..a095524 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java +++ 
b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java @@ -66,6 +66,10 @@ * */ public class MRKSequenceFileParser extends SingleLineFileRecordReader { + /* + * There is a line break in the header. The final column header (Feature + * Type) is on the next line by itself. + */ private static final String HEADER = "MGI Marker Accession ID\tMarker Symbol\tStatus\tMarker Type\tMarker Name\tcM position\tChromosome\tGenome Coordinate Start\tGenome Coordinate End\tStrand\tGenBank IDs\tRefSeq transcript IDs\tVEGA transcript IDs\tEnsembl transcript IDs\tUniProt IDs\tTrEMBL IDs\tVEGA protein IDs\tEnsembl protein IDs\tRefSeq protein IDs\tUniGene IDs"; private static final Logger logger = Logger.getLogger(MRKSequenceFileParser.class); @@ -92,7 +96,13 @@ protected StreamLineReader initializeLineReaderFromDownload(CharacterEncoding en @Override protected String getFileHeader() throws IOException { - return readLine().getText(); + String header = readLine().getText(); + /* + * There is a line break in the header. The final column header (Feature + * Type) is on the next line by itself so we burn a line here. 
+ */ + readLine(); + return header; } @Override @@ -223,11 +233,13 @@ protected MRKSequenceFileData parseRecordFromLine(Line line) { } } } + + String featureType = toks[20]; return new MRKSequenceFileData(mgiAccessionID, markerSymbol, status, markerType, markerName, cM_Position, chromosome, genomeCoordinateStart, genomeCoordinateEnd, strand, genBankAccessionIDs, refseqTranscriptIds, vegaTranscriptIds, ensemblTranscriptIds, uniprotIds, tremblIds, vegaProteinIds, - ensemblProteinIds, refseqProteinIds, unigeneIds, line.getByteOffset(), line.getLineNumber()); + ensemblProteinIds, refseqProteinIds, unigeneIds, featureType, line.getByteOffset(), line.getLineNumber()); } From 4a760add3032a4443045f10c9a7e4184e98d3163 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 10:32:18 -0700 Subject: [PATCH 23/36] Added flag to optionally clean data source files --- ...wnload-datasources-and-generate-triples.sh | 9 ++++- .../scripts/pom-rdf-gen-9606.xml | 1 + .../scripts/pom-rdf-gen-modelorgs.xml | 1 + datasource-rdfizer/scripts/pom-rdf-gen.xml | 36 ++++++++++--------- .../rdfizer/rdf/ice/IceRdfGenerator.java | 18 +++++----- 5 files changed, 39 insertions(+), 26 deletions(-) diff --git a/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh b/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh index 70f5035..3f15a45 100755 --- a/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh +++ b/datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh @@ -12,6 +12,7 @@ function print_usage { echo " [-i ]: The names of the datasources to download; if not specified, all available datasources will be downloaded." 
echo " [-t ${compressRdf} ${outputRecordLimit} 9606 + ${redownloadDataSourceFiles} ${datasourceNames} ${date} diff --git a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml index 13acd49..c42f271 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml @@ -41,6 +41,7 @@ ${compressRdf} ${outputRecordLimit} 9606,741158,63221,10090,947985,80274,57486,477816,477815,46456,35531,179238,1266728,116058,10092,10091,39442,10116,947987,7227,4932,947046,947045,947044,947043,947042,947041,947040,947039,947038,947037,947036,947035,929629,929587,929586,929585,927258,927256,889517,765312,764102,764101,764100,764099,764098,764097,721032,717647,658763,643680,614665,614664,580240,580239,574961,545124,538976,538975,502869,471861,471859,471510,468558,466209,464025,462210,462209,41870,307796,285006,1247190,1227742,1220494,1218710,1216859,1216345,1204498,1201112,1196866,1182968,1182967,1182966,1177187,1162674,1162673,1162672,1162671,1158205,1158204,1149757,1144731,1138861,1097555,1095001,1087981,559292,6239,7955,3702 + ${redownloadDataSourceFiles} ${datasourceNames} ${date} diff --git a/datasource-rdfizer/scripts/pom-rdf-gen.xml b/datasource-rdfizer/scripts/pom-rdf-gen.xml index c9f880b..5d275be 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen.xml @@ -35,23 +35,25 @@ -classpath edu.ucdenver.ccp.datasource.rdfizer.rdf.ice.IceRdfGenerator - NAME - - ${baseSourceDir} - - ${baseRdfDir} - - ${compressRdf} - - ${outputRecordLimit} - - ${taxonIDs} - - ${datasourceNames} - - ${date} + NAME + + ${baseSourceDir} + + ${baseRdfDir} + + ${compressRdf} + + ${outputRecordLimit} + + ${taxonIDs} + + ${redownloadDataSourceFiles} + + ${datasourceNames} + + ${date} diff --git a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java 
b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java index 40f668c..50bdfa4 100644 --- a/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java +++ b/datasource-rdfizer/src/main/java/edu/ucdenver/ccp/datasource/rdfizer/rdf/ice/IceRdfGenerator.java @@ -370,19 +370,21 @@ public enum RunBy { * NCBI Taxonomy) that will be used to limit RDF generation where * applicable, e.g. 9606 to convert only human-related database * records to RDF
+ * args[6]: Clean data source files (if true, then the data + * source files will be deleted and re-downloaded) * * The remaining input arguments depend on args[0]:
* if NAME:
- * args[6]: comma-delimited list of FileDataSource names to + * args[7]: comma-delimited list of FileDataSource names to * process
- * args[7]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not + * args[8]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not * included or if "null" then the current date will be used
*
* if INDEX:
- * args[6]: start stage args
- * [7]: the number of stages to process
- * args[8]: the Split type: either BY_STAGES or NONE
- * if BY_STAGES, then the index in args[6] corresponds to a + * args[7]: start stage args
+ * args[8]: the number of stages to process
+ * args[9]: the Split type: either BY_STAGES or NONE
+ * if BY_STAGES, then the index in args[7] corresponds to a * particular stage of a FileDataSource. Many of the * FileDataSources are processed in a single stage, however some * of the larger files are split into multiple stages to speed up @@ -392,7 +394,7 @@ public enum RunBy { * stage. This will result in longer execution times for the * larger files, however duplicate triple removal can be done * concurrently.
- * args[9]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not + * args[10]: [OPTIONAL] date to use in the form yyyy-mm-dd. If not * included or if "null" then the current date will be used * */ @@ -410,7 +412,6 @@ public static void main(String[] args) { File baseSourceFileDirectory = new File(args[index++]); File baseRdfOutputDirectory = new File(args[index++]); - boolean cleanSourceFiles = false;// Boolean.valueOf(args[index++]); boolean compress = Boolean.valueOf(args[index++]); int outputRecordLimit = Integer.valueOf(args[index++]); String taxonIdsStr = args[index++]; @@ -423,6 +424,7 @@ public static void main(String[] args) { taxonIds.add(new NcbiTaxonomyID(id)); } } + boolean cleanSourceFiles = Boolean.valueOf(args[index++]); try { From cfe69cc22091c49e14350a0a0d13a2633fba9dd3 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 11:04:31 -0700 Subject: [PATCH 24/36] Added handling for multiple entries in Entrez Id field --- .../pharmgkb/PharmGkbGeneFileParser.java | 21 ++++++++++- .../pharmgkb/PharmGkbGeneFileRecord.java | 37 ++++++++++--------- .../pharmgkb/PharmGkbGeneFileParserTest.java | 4 +- 3 files changed, 40 insertions(+), 22 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java index 98d0afb..fd351f1 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java @@ -37,7 +37,9 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.HashSet; import java.util.List; +import java.util.Set; import org.apache.commons.lang.StringUtils; import org.apache.log4j.Logger; @@ -165,7 +167,7 @@ protected String 
getExpectedFileHeader() throws IOException { protected PharmGkbGeneFileRecord parseRecordFromLine(Line line) { String[] toks = line.getText().split(RegExPatterns.TAB, -1); PharmGkbID pharmGkbAccessionId = new PharmGkbID(toks[0]); - EntrezGeneID entrezGeneId = StringUtils.isNotBlank(toks[1]) ? new EntrezGeneID(toks[1]) : null; + Set entrezGeneIds = getEntrezGeneIDs(toks[1]); EnsemblGeneID ensemblGeneId = StringUtils.isNotBlank(toks[2]) ? new EnsemblGeneID(toks[2]) : null; String name = StringUtils.isNotBlank(toks[3]) ? new String(toks[3]) : null; String symbol = StringUtils.isNotBlank(toks[4]) ? new String(toks[4]) : null; @@ -202,12 +204,27 @@ protected PharmGkbGeneFileRecord parseRecordFromLine(Line line) { Integer chromosomeStart = (toks[12].equalsIgnoreCase("null")) ? null : Integer.parseInt(toks[12]); Integer chromosomeEnd = (toks[13].equalsIgnoreCase("null")) ? null : Integer.parseInt(toks[13]); - return new PharmGkbGeneFileRecord(pharmGkbAccessionId, entrezGeneId, ensemblGeneId, name, symbol, + return new PharmGkbGeneFileRecord(pharmGkbAccessionId, entrezGeneIds, ensemblGeneId, name, symbol, alternativeNames, alternativeSymbols, isVip, hasVariantAnnotation, crossReferences, hasCpicDosingGuideline, chromosome, chromosomeStart, chromosomeEnd, line.getByteOffset(), line.getLineNumber()); } + private Set getEntrezGeneIDs(String idStr) { + Set ids = new HashSet(); + if (StringUtils.isNotBlank(idStr)) { + if (idStr.contains(",")) { + idStr = idStr.replaceAll("\"", ""); + for (String tok : idStr.split(",")) { + ids.add(new EntrezGeneID(tok)); + } + } else { + ids.add(new EntrezGeneID(idStr)); + } + } + return ids; + } + /** * @param refStr * @return diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileRecord.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileRecord.java index a647ea9..42ffb26 100644 --- 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileRecord.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileRecord.java @@ -34,21 +34,22 @@ */ -import java.util.Collection; - -import lombok.Data; - -import org.apache.log4j.Logger; - -import edu.ucdenver.ccp.datasource.fileparsers.License; -import edu.ucdenver.ccp.datasource.fileparsers.Record; -import edu.ucdenver.ccp.datasource.fileparsers.RecordField; -import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecord; -import edu.ucdenver.ccp.datasource.identifiers.DataSource; -import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; -import edu.ucdenver.ccp.datasource.identifiers.ensembl.EnsemblGeneID; -import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.EntrezGeneID; -import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; +import java.util.Collection; +import java.util.Set; + +import lombok.Data; + +import org.apache.log4j.Logger; + +import edu.ucdenver.ccp.datasource.fileparsers.License; +import edu.ucdenver.ccp.datasource.fileparsers.Record; +import edu.ucdenver.ccp.datasource.fileparsers.RecordField; +import edu.ucdenver.ccp.datasource.fileparsers.SingleLineFileRecord; +import edu.ucdenver.ccp.datasource.identifiers.DataSource; +import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ensembl.EnsemblGeneID; +import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.EntrezGeneID; +import edu.ucdenver.ccp.datasource.identifiers.pharmgkb.PharmGkbID; /** * File record capturing single line record from PharmGKB's genes.tsv file. 
@@ -65,7 +66,7 @@ public class PharmGkbGeneFileRecord extends SingleLineFileRecord { @RecordField private final PharmGkbID accessionId; @RecordField - private final EntrezGeneID entrezGeneId; + private final Set entrezGeneIds; @RecordField private final EnsemblGeneID ensemblGeneId; @RecordField @@ -105,14 +106,14 @@ public class PharmGkbGeneFileRecord extends SingleLineFileRecord { * @param hasVariantAnnotation * @param crossReferences */ - public PharmGkbGeneFileRecord(PharmGkbID accessionId, EntrezGeneID entrezGeneId, EnsemblGeneID ensemblGeneId, + public PharmGkbGeneFileRecord(PharmGkbID accessionId, Set entrezGeneIds, EnsemblGeneID ensemblGeneId, String name, String symbol, Collection alternativeNames, Collection alternativeSymbols, boolean isVip, boolean hasVariantAnnotation, Collection> crossReferences, boolean hasCpicDosingGuideline, String chromosome, Integer chromosomalStart, Integer chromosomalEnd, long byteOffset, long lineNumber) { super(byteOffset, lineNumber); this.accessionId = accessionId; - this.entrezGeneId = entrezGeneId; + this.entrezGeneIds = entrezGeneIds; this.ensemblGeneId = ensemblGeneId; this.name = name; this.symbol = symbol; diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParserTest.java index f7ed3a8..f8ff912 100644 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParserTest.java @@ -88,7 +88,7 @@ public void testParser() throws IOException { RecordReader reader = initSampleRecordReader(); PharmGkbGeneFileRecord r = reader.next(); assertEquals("PA100", r.getAccessionId().getDataElement()); - assertEquals(995, r.getEntrezGeneId().getDataElement().intValue()); + 
assertEquals(995, r.getEntrezGeneIds().iterator().next().getDataElement().intValue()); assertEquals("ENSG00000158402", r.getEnsemblGeneId().getDataElement()); assertEquals("cell division cycle 25 homolog C (S. pombe)", r.getName()); assertEquals("CDC25C", r.getSymbol()); @@ -159,7 +159,7 @@ public void testParser() throws IOException { r = reader.next(); assertEquals("PA101", r.getAccessionId().getDataElement()); - assertEquals(1017, r.getEntrezGeneId().getDataElement().intValue()); + assertEquals(1017, r.getEntrezGeneIds().iterator().next().getDataElement().intValue()); assertFalse(reader.hasNext()); } From 27e666023301dcedb6692352f43bb0650dd6ce8a Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 11:59:35 -0700 Subject: [PATCH 25/36] Removed the IRefWeb host This class should probably be deprecated --- .../ucdenver/ccp/datasource/fileparsers/download/FtpHost.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/download/FtpHost.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/download/FtpHost.java index 12fa1ee..ee096e0 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/download/FtpHost.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/download/FtpHost.java @@ -80,7 +80,5 @@ private FtpHost() { public static final String KEGG_GENEMAPTAB_PATH = "pub/kegg/pathway/organisms"; public static final String MGI_REPORTS_PATH = "pub/reports"; - - public static final String IREFWEB_HOST = "ftp.no.embnet.org"; } From 7bf4214993823c287c400ba13a27bb552f583d05 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 12:00:56 -0700 Subject: [PATCH 26/36] Added argument for identifier resolution to improve error messages --- .../dip/DipYYYYMMDDFileParser.java | 41 +++++----- .../drugbank/DrugBankDrugRecord.java | 21 ++--- 
.../EmblSequenceDatabaseFileParserBase.java | 2 +- .../GeneticAssociationDbAllTxtFileData.java | 2 +- .../hgnc/HgncDownloadFileParser.java | 4 +- .../hprd/HprdIdMappingsTxtFileParser.java | 4 +- .../irefweb/IRefWebInteraction.java | 3 +- .../irefweb/IRefWebPsiMitab2_6FileParser.java | 77 +++++++++---------- .../mgi/MRKSequenceFileParser.java | 7 +- .../gene/EntrezGene2AccessionFileData.java | 35 +++++---- .../pharmgkb/PharmGkbGeneFileParser.java | 8 +- .../rgd/RgdAnnotationFileIdResolver.java | 4 + .../hgnc/HgncDownloadFileParserTest.java | 4 +- .../IRefWebPsiMitab2_6FileParserTest.java | 4 +- .../mgi/MRKSequenceFileParserTest.java | 16 ++-- .../NucleotideAccessionResolver.java | 17 +++- .../identifiers/ProteinAccessionResolver.java | 20 ++++- .../NucleotideAccessionResolverTest.java | 6 +- .../ProteinAccessionResolverTest.java | 12 +-- 19 files changed, 153 insertions(+), 134 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java index 96e15e4..16c5a43 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java @@ -68,6 +68,7 @@ import edu.ucdenver.ccp.datasource.fileparsers.obo.MiOntologyIdTermPair; import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader; import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; +import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.ProteinAccessionResolver; import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractionID; import edu.ucdenver.ccp.datasource.identifiers.dip.DipInteractorID; @@ -77,8 +78,8 @@ import 
edu.ucdenver.ccp.identifier.publication.PubMedID; /** - * This class is used to parse DIPYYYMMDD files which can be downloaded from the DIP website: - * http://dip.doe-mbi.ucla.edu/dip/Main.cgi + * This class is used to parse DIPYYYMMDD files which can be downloaded from the + * DIP website: http://dip.doe-mbi.ucla.edu/dip/Main.cgi * * @author Bill Baumgartner * @@ -197,7 +198,8 @@ private Set getInteractionExperiments(String detection DipInteractionType interactionType = MiOntologyIdTermPair.parseString(DipInteractionType.class, interactionTypes[i]); DipProcessingStatus processingStatus = getDipProcessingStatus(processingStatuses[i], line); - String firstAuthorName = null; // change if the first author column ever contains names + String firstAuthorName = null; // change if the first author column + // ever contains names DipPublication publication = getDipPublication(firstAuthorName, pmids[i * 2], pmids[i * 2 + 1]); experiments.add(new DipInteractionExperiment(publication, processingStatus, detectionMethod, @@ -212,9 +214,9 @@ private Set getInteractionExperiments(String detection * @param string * @param string2 * @param pmids - * @return {@link DipPublication} from first author name and conversions of strings like - * "pubmed:9194558" and "pubmed:DIP-209S" into a {@link PubMedID} and a - * {@link DipPublicationId} + * @return {@link DipPublication} from first author name and conversions of + * strings like "pubmed:9194558" and "pubmed:DIP-209S" into a + * {@link PubMedID} and a {@link DipPublicationId} */ private DipPublication getDipPublication(String firstAuthorName, String pmidStr, String dipPubIdStr) { PubMedID pmid; @@ -230,7 +232,8 @@ private DipPublication getDipPublication(String firstAuthorName, String pmidStr, /** * @param string - * @return {@link DipProcessingStatus} parsed from a string such as: "dip:0002(small scale)" + * @return {@link DipProcessingStatus} parsed from a string such as: + * "dip:0002(small scale)" */ private DipProcessingStatus 
getDipProcessingStatus(String statusStr, String line) { Pattern p = Pattern.compile("(dip:\\d+)\\((.*?)\\)"); @@ -258,8 +261,9 @@ private DipInteractor getInteractor(String interactorStr, String alternateIdsStr } /* - * The columns for alternate IDs and aliases are always set to "-". If this is no - * longer the case then an exception will be thrown and code changes required. + * The columns for alternate IDs and aliases are always set to + * "-". If this is no longer the case then an exception will be + * thrown and code changes required. */ Set alternateIds = null; if (!alternateIdsStr.trim().equals("-")) { @@ -318,13 +322,7 @@ private DataSourceIdentifier resolveId(String idStr) { return new DipInteractorID(idStr); } if (idStr.startsWith("refseq:")) { - try { - return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:")); - // return new RefSeqID(StringUtil.removePrefix(idStr, "refseq:")); - } catch (IllegalArgumentException e) { - logger.warn("Invalid RefSeq identifier detected: " + idStr); - return null; - } + return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "refseq:"), idStr); } if (idStr.startsWith("uniprotkb:")) { if (idStr.contains(StringConstants.HYPHEN_MINUS)) { @@ -333,8 +331,7 @@ private DataSourceIdentifier resolveId(String idStr) { try { return new UniProtID(StringUtil.removePrefix(idStr, "uniprotkb:")); } catch (IllegalArgumentException e) { - logger.warn("Invalid UniProt identifier detected: " + idStr); - return null; + return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); } } throw new IllegalArgumentException("Unhandled identifier type: " + idStr); @@ -347,16 +344,16 @@ private DataSourceIdentifier resolveId(String idStr) { // * MI id // * @return id if recognized; otherwise, null // */ - // private static MolecularInteractionOntologyTermID extractMiId(String inputStr) { + // private static MolecularInteractionOntologyTermID extractMiId(String + // 
inputStr) { // Pattern methodIDPattern = Pattern.compile("(MI:\\d+),?\\("); // Matcher m = methodIDPattern.matcher(inputStr); // if (m.find()) { // return new MolecularInteractionOntologyTermID(m.group(1)); // } - // logger.error("Unable to locate ExperimentalMethod MI ID in String: " + inputStr); + // logger.error("Unable to locate ExperimentalMethod MI ID in String: " + + // inputStr); // return null; // } - - } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java index e98a736..9c0498b 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java @@ -160,7 +160,6 @@ import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggCompoundID; import edu.ucdenver.ccp.datasource.identifiers.kegg.KeggDrugID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.MeshID; -import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.GiNumberID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.snp.SnpRsId; import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID; import edu.ucdenver.ccp.datasource.identifiers.obo.ChebiOntologyID; @@ -1299,22 +1298,16 @@ private static DataSourceIdentifier resolveIdentifier(String resource, String } else if (resource.equals("GeneCards")) { return new GeneCardId(identifier); } else if (resource.equals("GenBank Gene Database")) { - return NucleotideAccessionResolver.resolveNucleotideAccession(identifier); + return NucleotideAccessionResolver.resolveNucleotideAccession(identifier, "GenBank Gene Database:" + + identifier); } else if (resource.equals("GenBank Protein Database")) { - try { - return ProteinAccessionResolver.resolveProteinAccession(identifier); - } catch 
(IllegalArgumentException e) { - if (identifier.matches("\\d+")) { - return new GiNumberID(identifier); - } else { - return new ProbableErrorDataSourceIdentifier("identifier", "GenBank", - "Observed invalid GenBank protein identifier: " + identifier); - } - } + return ProteinAccessionResolver + .resolveProteinAccession(identifier, "GenBank Protein Database" + identifier); } else if (resource.equals("GenBank")) { - DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier); + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(identifier, + "GenBank:" + identifier); if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId.getClass())) { - return ProteinAccessionResolver.resolveProteinAccession(identifier); + return ProteinAccessionResolver.resolveProteinAccession(identifier, "GenBank:" + identifier); } } else if (resource.equals("UniProtKB")) { return new UniProtID(identifier); diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java index 439c02e..e8d3c21 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/embl/EmblSequenceDatabaseFileParserBase.java @@ -500,7 +500,7 @@ protected abstract T invokeConstructor(E idLineContents, List accessionNumber private EmblAssemblyInformation parseASLine(String line) { String[] toks = line.split("\\s+"); String localSpan = toks[1]; - DataSourceIdentifier primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2]); + DataSourceIdentifier primaryIdentifier = NucleotideAccessionResolver.resolveNucleotideAccession(toks[2], toks[2]); String primarySpan = 
toks[3]; boolean originatesFromComplementary = (toks.length == 5 && toks[4].trim().equalsIgnoreCase("c")) ? true : false; return new EmblAssemblyInformation(localSpan, primaryIdentifier, primarySpan, originatesFromComplementary); diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java index 33ff25a..3b8e4bf 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/gad/GeneticAssociationDbAllTxtFileData.java @@ -473,7 +473,7 @@ public static GeneticAssociationDbAllTxtFileData parseGeneticAssociationDbAllTxt if (acc.matches("\\d+")) { nucleotideId = new GiNumberID(acc); } else { - nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); + nucleotideId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, refseqURL); } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java index 7497744..753a6ae 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java @@ -564,9 +564,9 @@ private Set> resolveAccessionNumbers(String accListStr) Set> accNumbers = new HashSet>(); if (!accListStr.isEmpty()) { for (String acc : accListStr.split(",")) { - DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, acc); if 
(ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { - DataSourceIdentifier proAccId = ProteinAccessionResolver.resolveProteinAccession(acc); + DataSourceIdentifier proAccId = ProteinAccessionResolver.resolveProteinAccession(acc, acc); accNumbers.add(proAccId); } else { accNumbers.add(nucAccId); diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java index a5417ea..278b79c 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hprd/HprdIdMappingsTxtFileParser.java @@ -115,9 +115,9 @@ protected HprdIdMappingsTxtFileData parseRecordFromLine(Line line) { } private DataSourceIdentifier resolveAccession(String acc) { - DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, acc); if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { - return ProteinAccessionResolver.resolveProteinAccession(acc); + return ProteinAccessionResolver.resolveProteinAccession(acc, acc); } return nucAccId; } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebInteraction.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebInteraction.java index c0be435..a7d31ed 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebInteraction.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebInteraction.java @@ -46,7 +46,6 @@ import edu.ucdenver.ccp.datasource.identifiers.irefweb.IrigId; import 
edu.ucdenver.ccp.datasource.identifiers.irefweb.RigId; import edu.ucdenver.ccp.datasource.identifiers.other.ImexId; -import edu.ucdenver.ccp.identifier.publication.PubMedID; @Data @Record(dataSource = DataSource.IREFWEB, label="interaction") @@ -57,7 +56,7 @@ public class IRefWebInteraction implements DataRecord { @RecordField(comment = "Notes: According to MITAB2.6 format this column should contain a pipe-delimited list of author surnames in which the interaction has been shown.\nThis column will usually include only one author name reference. However, some experimental evidences have secondary references which could be included here. This filed also includes references which are not author names as in the following examples:\nOPHID Predicted Protein Interaction\nHPRD Text Mining Confirmation\nMINT Text Mining Confirmation") private final String author; @RecordField(comment = "Notes: This is a non-redundant list of PubMed identifiers pointing to literature that supports the interaction. According to MITAB2.6 format, this column should contain a pipe-delimited set of databaseName:identifier pairs such as pubmed:12345. The source database name is always pubmed.") - private final Set pmids; + private final Set> pmids; @RecordField private final IRefWebInteractionType interactionType; @RecordField(comment = "source interaction-database and accessions.\nExample: intact:EBI-761694|rigid:3ERiFkUFsm7ZUHIRJTx8ZlHILRA|irigid:1234|edgetype:X\nNotes: Each reference is presented as a database name:identifier pair.\nChange: The source database is listed first. Additional information is pipe-delimited and presented here for the convenience of PSICQUIC web-service users (these services presently truncate this file at column 15 as they only support MITAB2.5). 
See columns 35,45,53.\nThe source database names that appear in this column are taken from the PSI-MI controlled vocabulary at the following location (where possible): http://www.ebi.ac.uk/ontology-lookup/browse.do?ontName=MI\nIf an interaction record identifier is not provided by the source database, this entry will appear as database-name:- with the identifier region replaced with a dash (-).") diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java index 2c3a615..4643a13 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java @@ -56,18 +56,15 @@ import java.util.HashSet; import java.util.Set; -import org.apache.log4j.BasicConfigurator; import org.apache.log4j.Logger; -import edu.ucdenver.ccp.common.download.FtpDownload; +import edu.ucdenver.ccp.common.download.HttpDownload; import edu.ucdenver.ccp.common.file.CharacterEncoding; import edu.ucdenver.ccp.common.file.reader.Line; import edu.ucdenver.ccp.common.file.reader.StreamLineReader; -import edu.ucdenver.ccp.common.ftp.FTPUtil.FileType; import edu.ucdenver.ccp.common.string.RegExPatterns; import edu.ucdenver.ccp.common.string.StringConstants; import edu.ucdenver.ccp.common.string.StringUtil; -import edu.ucdenver.ccp.datasource.fileparsers.download.FtpHost; import edu.ucdenver.ccp.datasource.fileparsers.obo.MiOntologyIdTermPair; import edu.ucdenver.ccp.datasource.fileparsers.obo.NcbiTaxonomyIdTermPair; import edu.ucdenver.ccp.datasource.fileparsers.taxonaware.TaxonAwareSingleLineFileRecordReader; @@ -137,12 +134,9 @@ public class IRefWebPsiMitab2_6FileParser extends TaxonAwareSingleLineFileRecord private static final 
String HEADER = "#uidA\tuidB\taltA\taltB\taliasA\taliasB\tmethod\tauthor\tpmids\ttaxa\ttaxb\tinteractionType\tsourcedb\tinteractionIdentifier\tconfidence\texpansion\tbiological_role_A\tbiological_role_B\texperimental_role_A\texperimental_role_B\tinteractor_type_A\tinteractor_type_B\txrefs_A\txrefs_B\txrefs_Interaction\tAnnotations_A\tAnnotations_B\tAnnotations_Interaction\tHost_organism_taxid\tparameters_Interaction\tCreation_date\tUpdate_date\tChecksum_A\tChecksum_B\tChecksum_Interaction\tNegative\tOriginalReferenceA\tOriginalReferenceB\tFinalReferenceA\tFinalReferenceB\tMappingScoreA\tMappingScoreB\tirogida\tirogidb\tirigid\tcrogida\tcrogidb\tcrigid\ticrogida\ticrogidb\ticrigid\timex_id\tedgetype\tnumParticipants"; - // public static final String FTP_FILE_NAME = "All.mitab.03022013.txt.zip"; - public static final String FTP_FILE_NAME = "All.mitab.07042015.txt.zip"; public static final CharacterEncoding ENCODING = CharacterEncoding.US_ASCII; - public static final String FTP_USER_NAME = "ftp"; - @FtpDownload(server = FtpHost.IREFWEB_HOST, path = "irefindex/data/archive/release_10.0/psi_mitab/MITAB2.6/", filename = FTP_FILE_NAME, filetype = FileType.BINARY, username = FTP_USER_NAME, decompress = true, targetFileName = "All.mitab.04072015.txt") + @HttpDownload(url = "http://irefindex.org/download/irefindex/data/archive/release_14.0/psi_mitab/MITAB2.6/All.mitab.07042015.txt.zip", decompress = true, targetFileName = "All.mitab.04072015.txt") private File allMitabTxtFile; public IRefWebPsiMitab2_6FileParser(File file, CharacterEncoding encoding) throws IOException, @@ -254,7 +248,7 @@ private IRefWebInteraction getInteraction(String detectionMethodStr, String auth detectionMethodStr); } String author = (authorStr.trim().equals(StringConstants.HYPHEN_MINUS)) ? 
null : authorStr; - Set pmids = parsePmidsStr(pmidsStr); + Set> pmids = parsePmidsStr(pmidsStr); IRefWebInteractionType interactionType = null; if (!interactionTypeStr.trim().equals(StringConstants.HYPHEN_MINUS)) { interactionType = MiOntologyIdTermPair.parseString(IRefWebInteractionType.class, interactionTypeStr); @@ -404,9 +398,9 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { } else if (idStr.startsWith("icrogid:")) { return new IcrogId(StringUtil.removePrefix(idStr, "icrogid:")); } else if (idStr.startsWith("refseq:")) { - return getRefseqAccession(StringUtil.removePrefix(idStr, "refseq:").toUpperCase()); + return getRefseqAccession(StringUtil.removePrefix(idStr, "refseq:").toUpperCase(), idStr); } else if (idStr.startsWith("RefSeq:")) { - return getRefseqAccession(StringUtil.removePrefix(idStr, "RefSeq:").toUpperCase()); + return getRefseqAccession(StringUtil.removePrefix(idStr, "RefSeq:").toUpperCase(), idStr); } else if (idStr.startsWith("rogid:")) { return new RogId(StringUtil.removePrefix(idStr, "rogid:")); } else if (idStr.startsWith("irogid:")) { @@ -464,21 +458,21 @@ private DataSourceIdentifier resolveInteractorId(String idStr) { } else if (idStr.startsWith("InnateDB:")) { return new InnateDbId(StringUtil.removePrefix(idStr, "InnateDB:")); } else if (idStr.startsWith("emb:")) { - return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "emb:")); + return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(idStr, "emb:"), idStr); } else if (idStr.startsWith("dbj:")) { - return getGenbankAccession(StringUtil.removePrefix(idStr, "dbj:")); + return getGenbankAccession(StringUtil.removePrefix(idStr, "dbj:"), idStr); } else if (idStr.startsWith("ddbj/embl/genbank:")) { - return getGenbankAccession(StringUtil.removePrefix(idStr, "ddbj/embl/genbank:")); + return getGenbankAccession(StringUtil.removePrefix(idStr, "ddbj/embl/genbank:"), idStr); } else if (idStr.startsWith("GenBank:")) { - 
return getGenbankAccession(StringUtil.removePrefix(idStr, "GenBank:")); + return getGenbankAccession(StringUtil.removePrefix(idStr, "GenBank:"), idStr); } else if (idStr.startsWith("genbank indentifier:")) { - return getGenbankAccession(StringUtil.removePrefix(idStr, "genbank indentifier:")); + return getGenbankAccession(StringUtil.removePrefix(idStr, "genbank indentifier:"), idStr); } else if (idStr.startsWith("GB:")) { - return getGenbankAccession(StringUtil.removePrefix(idStr, "GB:")); + return getGenbankAccession(StringUtil.removePrefix(idStr, "GB:"), idStr); } else if (idStr.startsWith("gb:")) { - return getGenbankAccession(StringUtil.removePrefix(idStr, "gb:")); + return getGenbankAccession(StringUtil.removePrefix(idStr, "gb:"), idStr); } else if (idStr.startsWith("tpg:")) { - return getGenbankAccession(StringUtil.removePrefix(idStr, "tpg:")); + return getGenbankAccession(StringUtil.removePrefix(idStr, "tpg:"), idStr); } else if (idStr.startsWith("pdb:")) { return new PdbID(StringUtil.removePrefix(idStr, "pdb:")); } else if (idStr.startsWith("flybase:")) { @@ -508,16 +502,15 @@ private DataSourceIdentifier getUniprotId(String idStr) { } return new UniProtID(idStr); } catch (IllegalArgumentException e) { - logger.warn("Detected invalid UniProt accession: " + idStr); - return null; + return new ProbableErrorDataSourceIdentifier(idStr, null, e.getMessage()); } } - private DataSourceIdentifier getRefseqAccession(String acc) { + private DataSourceIdentifier getRefseqAccession(String acc, String accWithPrefix) { try { return new RefSeqID(acc); } catch (IllegalArgumentException e) { - return getGenbankAccession(acc); + return getGenbankAccession(acc, accWithPrefix); } } @@ -525,10 +518,11 @@ private DataSourceIdentifier getRefseqAccession(String acc) { * @param removePrefix * @return */ - private DataSourceIdentifier getGenbankAccession(String acc) { - DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc); + private 
DataSourceIdentifier getGenbankAccession(String acc, String accWithPrefix) { + DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, + accWithPrefix); if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { - return ProteinAccessionResolver.resolveProteinAccession(acc); + return ProteinAccessionResolver.resolveProteinAccession(acc, accWithPrefix); } else { return nucAccId; } @@ -538,17 +532,17 @@ private DataSourceIdentifier getGenbankAccession(String acc) { * @param pmidsStr * @return */ - private Set parsePmidsStr(String pmidsStr) { + private Set> parsePmidsStr(String pmidsStr) { if (pmidsStr.trim().equals(StringConstants.HYPHEN_MINUS) || pmidsStr.trim().equals("pubmed:0")) { return null; } String[] toks = pmidsStr.split(RegExPatterns.PIPE); - Set pmids = new HashSet(); + Set> pmids = new HashSet>(); for (String tok : toks) { try { pmids.add(new PubMedID(StringUtil.removePrefix(tok, "pubmed:"))); } catch (IllegalArgumentException e) { - logger.warn("Detected invalid pubmed id: " + e.getMessage()); + pmids.add(new ProbableErrorDataSourceIdentifier(tok, null, e.getMessage())); } } return pmids; @@ -630,7 +624,7 @@ private IRefWebInteractor getInteractor(String uniqueIdStr, String altIdStr, Str private Set resolveAliasSymbols(String aliasStr) { Set aliases = new HashSet(); for (String alias : aliasStr.split(RegExPatterns.PIPE)) { - String aliasSymbol = alias;//resolveAliasSymbol(alias); + String aliasSymbol = alias;// resolveAliasSymbol(alias); if (aliasSymbol != null && !aliasSymbol.equals("-")) { aliases.add(aliasSymbol); } @@ -638,16 +632,17 @@ private Set resolveAliasSymbols(String aliasStr) { return aliases; } -// /** -// * @param alias -// * @return -// */ -// private String resolveAliasSymbol(String aliasStr) { -// if (aliasStr.startsWith("entrezgene/locuslink:")) { -// return new String(StringUtil.removePrefix(aliasStr, "entrezgene/locuslink:")); -// } -// return aliasStr; -// } + // /** + // * @param alias + 
// * @return + // */ + // private String resolveAliasSymbol(String aliasStr) { + // if (aliasStr.startsWith("entrezgene/locuslink:")) { + // return new String(StringUtil.removePrefix(aliasStr, + // "entrezgene/locuslink:")); + // } + // return aliasStr; + // } /** * @param aliasStr @@ -684,7 +679,7 @@ private DataSourceIdentifier resolveAliasId(String aliasStr) { } else if (aliasStr.startsWith("rogid:")) { return new RogId(StringUtil.removePrefix(aliasStr, "rogid:")); } else if (aliasStr.startsWith("refseq:")) { - return getRefseqAccession(StringUtil.removePrefix(aliasStr, "refseq:")); + return getRefseqAccession(StringUtil.removePrefix(aliasStr, "refseq:"), aliasStr); } else if (aliasStr.startsWith("hgnc:")) { return new HgncGeneSymbolID(StringUtil.removePrefix(aliasStr, "hgnc:")); } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java index a095524..41d49c8 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java @@ -142,7 +142,7 @@ protected MRKSequenceFileData parseRecordFromLine(Line line) { for (String genBankID : genBankIDs) { if (genBankID.trim().length() > 0) { DataSourceIdentifier resolveNucleotideAccession = NucleotideAccessionResolver - .resolveNucleotideAccession(genBankID); + .resolveNucleotideAccession(genBankID, genBankID); genBankAccessionIDs.add(resolveNucleotideAccession); } } @@ -233,13 +233,14 @@ protected MRKSequenceFileData parseRecordFromLine(Line line) { } } } - + String featureType = toks[20]; return new MRKSequenceFileData(mgiAccessionID, markerSymbol, status, markerType, markerName, cM_Position, chromosome, genomeCoordinateStart, genomeCoordinateEnd, strand, genBankAccessionIDs, 
refseqTranscriptIds, vegaTranscriptIds, ensemblTranscriptIds, uniprotIds, tremblIds, vegaProteinIds, - ensemblProteinIds, refseqProteinIds, unigeneIds, featureType, line.getByteOffset(), line.getLineNumber()); + ensemblProteinIds, refseqProteinIds, unigeneIds, featureType, line.getByteOffset(), + line.getLineNumber()); } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGene2AccessionFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGene2AccessionFileData.java index 65d918b..8736a73 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGene2AccessionFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGene2AccessionFileData.java @@ -60,19 +60,22 @@ @Record(dataSource = DataSource.EG, comment = "", license = License.NCBI, citation = "The NCBI handbook [Internet]. Bethesda (MD): National Library of Medicine (US), National Center for Biotechnology Information; 2002 Oct. Chapter 19 Gene: A Directory of Genes. 
Available from http://www.ncbi.nlm.nih.gov/books/NBK21091", label = "gene2accession record") public class EntrezGene2AccessionFileData extends SingleLineFileRecord { /* - * #Format: tax_id GeneID status RNA_nucleotide_accession.version RNA_nucleotide_gi - * protein_accession.version protein_gi genomic_nucleotide_accession.version - * genomic_nucleotide_gi start_position_on_the_genomic_accession - * end_position_on_the_genomic_accession orientation assembly (tab is used as a separator, pound - * sign - start of a comment) + * #Format: tax_id GeneID status RNA_nucleotide_accession.version + * RNA_nucleotide_gi protein_accession.version protein_gi + * genomic_nucleotide_accession.version genomic_nucleotide_gi + * start_position_on_the_genomic_accession + * end_position_on_the_genomic_accession orientation assembly (tab is used + * as a separator, pound sign - start of a comment) */ /* - * #Format: tax_id GeneID status RNA_nucleotide_accession.version RNA_nucleotide_gi - * protein_accession.version protein_gi genomic_nucleotide_accession.version - * genomic_nucleotide_gi start_position_on_the_genomic_accession - * end_position_on_the_genomic_accession orientation assembly mature_peptide_accession.version - * mature_peptide_gi Symbol (tab is used as a separator, pound sign - start of a comment) + * #Format: tax_id GeneID status RNA_nucleotide_accession.version + * RNA_nucleotide_gi protein_accession.version protein_gi + * genomic_nucleotide_accession.version genomic_nucleotide_gi + * start_position_on_the_genomic_accession + * end_position_on_the_genomic_accession orientation assembly + * mature_peptide_accession.version mature_peptide_gi Symbol (tab is used as + * a separator, pound sign - start of a comment) */ @RecordField(comment = "the unique identifier provided by NCBI Taxonomy for the species or strain/isolate") @@ -196,7 +199,8 @@ public static EntrezGene2AccessionFileData parseGene2AccessionLine(Line line) { DataSourceIdentifier 
RNA_nucleotide_accession_dot_version = null; if (!toks[3].equals("-") && status != null) { - RNA_nucleotide_accession_dot_version = NucleotideAccessionResolver.resolveNucleotideAccession(toks[3]); + RNA_nucleotide_accession_dot_version = NucleotideAccessionResolver.resolveNucleotideAccession(toks[3], + toks[3]); } String intStr = toks[4]; @@ -207,7 +211,7 @@ public static EntrezGene2AccessionFileData parseGene2AccessionLine(Line line) { DataSourceIdentifier protein_accession_dot_version = null; if (!toks[5].equals("-") && status != null) { - protein_accession_dot_version = ProteinAccessionResolver.resolveProteinAccession(toks[5]); + protein_accession_dot_version = ProteinAccessionResolver.resolveProteinAccession(toks[5], toks[5]); } intStr = toks[6]; @@ -218,8 +222,8 @@ public static EntrezGene2AccessionFileData parseGene2AccessionLine(Line line) { DataSourceIdentifier genomic_nucleotide_accession_dot_version = null; if (!toks[7].equals("-") && status != null) { - genomic_nucleotide_accession_dot_version = NucleotideAccessionResolver - .resolveNucleotideAccession(toks[7]); + genomic_nucleotide_accession_dot_version = NucleotideAccessionResolver.resolveNucleotideAccession( + toks[7], toks[7]); } intStr = toks[8]; @@ -257,7 +261,8 @@ public static EntrezGene2AccessionFileData parseGene2AccessionLine(Line line) { DataSourceIdentifier mature_peptide_accession_dot_version = null; if (!toks[13].equals("-")) { - mature_peptide_accession_dot_version = ProteinAccessionResolver.resolveProteinAccession(toks[13]); + mature_peptide_accession_dot_version = ProteinAccessionResolver.resolveProteinAccession(toks[13], + toks[13]); } intStr = toks[14]; diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java index fd351f1..23327d1 100644 --- 
a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java @@ -260,13 +260,13 @@ private DataSourceIdentifier resolveCrossRefId(String refStr) { } else if (refStr.startsWith(OMIM_PREFIX)) { return new OmimID(StringUtil.removePrefix(refStr, OMIM_PREFIX)); } else if (refStr.startsWith(REFSEQDNA_PREFIX)) { - return NucleotideAccessionResolver.resolveNucleotideAccession(StringUtil.removePrefix(refStr, - REFSEQDNA_PREFIX)); + return NucleotideAccessionResolver.resolveNucleotideAccession( + StringUtil.removePrefix(refStr, REFSEQDNA_PREFIX), refStr); } else if (refStr.startsWith(REFSEQRNA_PREFIX)) { return new RefSeqID(StringUtil.removePrefix(refStr, REFSEQRNA_PREFIX)); } else if (refStr.startsWith(REFSEQPROTEIN_PREFIX)) { - return ProteinAccessionResolver.resolveProteinAccession(StringUtil.removePrefix(refStr, - REFSEQPROTEIN_PREFIX)); + return ProteinAccessionResolver.resolveProteinAccession( + StringUtil.removePrefix(refStr, REFSEQPROTEIN_PREFIX), refStr); } else if (refStr.startsWith(UCSCGENOMEBROWSER_PREFIX)) { return new UcscGenomeBrowserId(StringUtil.removePrefix(refStr, UCSCGENOMEBROWSER_PREFIX)); } else if (refStr.startsWith(UNIPROT_PREFIX)) { diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java index 2cd990f..8ce4c2f 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java @@ -80,6 +80,10 @@ public DataSourceIdentifier resolveId(String idStr) { // there is one instance of RGD:[space]737465 return new 
RgdID(idStr.substring(idStr.lastIndexOf(" "))); } + if (idStr.matches("RGDG:\\d+")) { + // there is one instance of RGDG: + return new RgdID(idStr.substring(4)); + } if (idStr.matches("RDG:\\d+")) { // there are a few typos where RDG appears instead of RGD return new RgdID(idStr.substring(4)); diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java index f574e71..e96d636 100644 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java @@ -108,7 +108,7 @@ public void testParser() throws Exception { assertEquals("2012-10-12", dataRecord.getDateModified()); assertEquals("2010-11-25", dataRecord.getDateSymbolChanged()); assertEquals("2012-08-15", dataRecord.getDateNameChanged()); - assertEquals(CollectionsUtil.createSet(NucleotideAccessionResolver.resolveNucleotideAccession("BC040926")), + assertEquals(CollectionsUtil.createSet(NucleotideAccessionResolver.resolveNucleotideAccession("BC040926", null)), dataRecord.getAccessionNumbers()); assertEmpty(dataRecord.getEcNumbers()); assertEquals(new EntrezGeneID(503538), dataRecord.getEntrezGeneID()); @@ -158,7 +158,7 @@ public void testParser() throws Exception { assertEquals("2011-07-21", dataRecord.getDateModified()); assertNull(dataRecord.getDateSymbolChanged()); assertNull(dataRecord.getDateNameChanged()); - assertEquals(CollectionsUtil.createSet(NucleotideAccessionResolver.resolveNucleotideAccession("AF271790")), + assertEquals(CollectionsUtil.createSet(NucleotideAccessionResolver.resolveNucleotideAccession("AF271790", null)), dataRecord.getAccessionNumbers()); assertEmpty(dataRecord.getEcNumbers()); assertEquals(new EntrezGeneID(29974), 
dataRecord.getEntrezGeneID()); diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java index 2994431..f856a86 100644 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParserTest.java @@ -250,8 +250,8 @@ public void testParser() throws IOException { assertFalse(record.getInteraction().isNegative()); - assertEquals(new ProbableErrorDataSourceIdentifier("\"1FMO_I\"", null, - "Input is not a known protein accession pattern: \"1FMO_I\""), record.getInteractorA() + assertEquals(new ProbableErrorDataSourceIdentifier("GenBank:\"1FMO_I\"", null, + "Input is not a known accession pattern: GenBank:\"1FMO_I\""), record.getInteractorA() .getOriginalReference()); assertEquals(new RefSeqID("NP_032880"), record.getInteractorB().getOriginalReference()); assertEquals(new PdbID("1FMO_I"), record.getInteractorA().getFinalReference()); diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParserTest.java index c2783d9..35b4818 100755 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParserTest.java @@ -96,14 +96,14 @@ public void testParser() { assertEquals(MgiGeneType.GENE, record1.getMarkerType()); assertEquals(new String("RIKEN cDNA 0610007P14 gene"), record1.getMarkerName()); Set> expectedGenBankIds = new HashSet>(); - 
expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AF270646")); - expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AK002308")); - expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AK004480")); - expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AK152230")); - expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AU019315")); - expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("BC004591")); - expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("BG066052")); - expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("C77855")); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AF270646", null)); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AK002308", null)); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AK004480", null)); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AK152230", null)); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("AU019315", null)); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("BC004591", null)); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("BG066052", null)); + expectedGenBankIds.add(NucleotideAccessionResolver.resolveNucleotideAccession("C77855", null)); assertEquals(expectedGenBankIds, record1.getGenBankAccessionIDs()); Set expectedRefseqTranscriptIds = new HashSet(); diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java index 52e4ec5..917ba05 100644 --- 
a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolver.java @@ -102,7 +102,16 @@ public class NucleotideAccessionResolver { } } - public static DataSourceIdentifier resolveNucleotideAccession(String acc) { + /** + * @param acc + * @param idWithPrefix + * - optional, is only used as part of the error message if the + * acc cannot be resolved. Often the prefix is stripped prior to + * id resolution, this parameter allows the prefix to be included + * in the error message. + * @return + */ + public static DataSourceIdentifier resolveNucleotideAccession(String acc, String idWithPrefix) { acc = acc.toUpperCase().trim(); if (acc.matches("[A-Z][A-Z]_\\d+\\.?\\d*")) { return new RefSeqID(acc); @@ -151,7 +160,11 @@ public static DataSourceIdentifier resolveNucleotideAccession(String acc } } // logger.warn("Input is not a known nucleotide accession: " + acc); - return new ProbableErrorDataSourceIdentifier(acc, null, "Input is not a known nucleotide accession: " + acc); + if (idWithPrefix == null) { + idWithPrefix = acc; + } + return new ProbableErrorDataSourceIdentifier(idWithPrefix, null, "Input is not a known accession: " + + idWithPrefix); } } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java index 0efbf90..98e3cdd 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolver.java @@ -64,7 +64,16 @@ public class ProteinAccessionResolver { private static final String VALID_UNIPROT_PATTERN_3 = "[A-NR-Z][0-9][A-Z][A-Z0-9][A-Z0-9][0-9][A-Z][A-Z0-9][A-Z0-9][0-9]"; private 
static final String VALID_UNIPROT_PATTERN_2 = "[OPQ][0-9][A-Z0-9][A-Z0-9][A-Z0-9][0-9]"; - public static DataSourceIdentifier resolveProteinAccession(String acc) { + /** + * @param acc + * @param idWithPrefix + * - optional, is only used as part of the error message if the + * acc cannot be resolved. Often the prefix is stripped prior to + * id resolution, this parameter allows the prefix to be included + * in the error message. + * @return + */ + public static DataSourceIdentifier resolveProteinAccession(String acc, String idWithPrefix) { acc = acc.toUpperCase(); if (acc.matches("[A-Z][A-Z]_\\d+\\.?\\d*")) { return new RefSeqID(acc); @@ -119,9 +128,12 @@ public static DataSourceIdentifier resolveProteinAccession(String acc) { return new GenBankID(acc); } } - logger.warn("Input is not a known protein accession pattern: " + acc); - return new ProbableErrorDataSourceIdentifier(acc, null, "Input is not a known protein accession pattern: " - + acc); +// logger.warn("Input is not a known protein accession pattern: " + acc); + if (idWithPrefix == null) { + idWithPrefix = acc; + } + return new ProbableErrorDataSourceIdentifier(idWithPrefix, null, "Input is not a known accession pattern: " + + idWithPrefix); } } diff --git a/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolverTest.java b/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolverTest.java index d2d9517..b8021d8 100644 --- a/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolverTest.java +++ b/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/NucleotideAccessionResolverTest.java @@ -51,13 +51,13 @@ public class NucleotideAccessionResolverTest { @Test public void testRefseqResolution() { - assertEquals(new RefSeqID("NM_000518"), NucleotideAccessionResolver.resolveNucleotideAccession("NM_000518")); - assertEquals(new 
RefSeqID("NM_000518"), NucleotideAccessionResolver.resolveNucleotideAccession("NM_000518.2")); + assertEquals(new RefSeqID("NM_000518"), NucleotideAccessionResolver.resolveNucleotideAccession("NM_000518", null)); + assertEquals(new RefSeqID("NM_000518"), NucleotideAccessionResolver.resolveNucleotideAccession("NM_000518.2", null)); } @Test public void testGenbankResolution() { - assertEquals(new GenBankID("AC004528.1"), NucleotideAccessionResolver.resolveNucleotideAccession("AC004528.1")); + assertEquals(new GenBankID("AC004528.1"), NucleotideAccessionResolver.resolveNucleotideAccession("AC004528.1", null)); } } diff --git a/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolverTest.java b/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolverTest.java index edc578f..713f9d1 100644 --- a/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolverTest.java +++ b/datasource-identifiers/src/test/java/edu/ucdenver/ccp/datasource/identifiers/ProteinAccessionResolverTest.java @@ -54,12 +54,12 @@ public class ProteinAccessionResolverTest { @Test public void testProteinAccessionResolution() { - assertEquals(new GenBankID("AAI00916"), ProteinAccessionResolver.resolveProteinAccession("AAI00916")); - assertEquals(new GenBankID("AAI00916.2"), ProteinAccessionResolver.resolveProteinAccession("AAI00916.2")); - assertEquals(new EmblID("CAI00916"), ProteinAccessionResolver.resolveProteinAccession("CAI00916")); - assertEquals(new DdbjId("GAI00916"), ProteinAccessionResolver.resolveProteinAccession("GAI00916")); - assertEquals(new RefSeqID("NP_795370"), ProteinAccessionResolver.resolveProteinAccession("NP_795370")); - assertEquals(new UniProtID("P59543"), ProteinAccessionResolver.resolveProteinAccession("P59543")); + assertEquals(new GenBankID("AAI00916"), ProteinAccessionResolver.resolveProteinAccession("AAI00916", null)); + assertEquals(new 
GenBankID("AAI00916.2"), ProteinAccessionResolver.resolveProteinAccession("AAI00916.2", null)); + assertEquals(new EmblID("CAI00916"), ProteinAccessionResolver.resolveProteinAccession("CAI00916", null)); + assertEquals(new DdbjId("GAI00916"), ProteinAccessionResolver.resolveProteinAccession("GAI00916", null)); + assertEquals(new RefSeqID("NP_795370"), ProteinAccessionResolver.resolveProteinAccession("NP_795370", null)); + assertEquals(new UniProtID("P59543"), ProteinAccessionResolver.resolveProteinAccession("P59543", null)); } } From 53c3ac8efb178ce477c33577cacb8dd8a69e7e0e Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 12:10:25 -0700 Subject: [PATCH 27/36] added handling for an RGD id typo --- .../fileparsers/rgd/RgdAnnotationFileIdResolver.java | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java index 8ce4c2f..f477e67 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdAnnotationFileIdResolver.java @@ -81,8 +81,8 @@ public DataSourceIdentifier resolveId(String idStr) { return new RgdID(idStr.substring(idStr.lastIndexOf(" "))); } if (idStr.matches("RGDG:\\d+")) { - // there is one instance of RGDG: - return new RgdID(idStr.substring(4)); + // there is one instance of RGDG:733289 + return new RgdID(idStr.substring(5)); } if (idStr.matches("RDG:\\d+")) { // there are a few typos where RDG appears instead of RGD @@ -114,12 +114,6 @@ public DataSourceIdentifier resolveId(String idStr) { if (idStr.matches("PW:\\d+")) { return new PwId(idStr); } - if (idStr.matches("rno:\\d+")) { - logger.warn("Ignoring RNO identifier: " + idStr + ". 
Not sure what this references..."); - // not sure what this is.. could be a kegg gene? it's used in the - // withOrFrom column - return null; - } if (idStr.startsWith("UniProtKB:")) { return new UniProtID(idStr.substring(10)); } From 9c7b81f28833498c62935c2946f64cfffc7637bd Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 12:48:08 -0700 Subject: [PATCH 28/36] Added handling for new column (supplied vega id) --- .../hgnc/HgncDownloadFileData.java | 494 +----------------- .../hgnc/HgncDownloadFileParser.java | 28 +- .../hgnc/HgncDownloadFileParserTest.java | 10 +- .../fileparsers/hgnc/hgnc_download.txt | 6 +- 4 files changed, 47 insertions(+), 491 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileData.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileData.java index 8473722..49a216d 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileData.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileData.java @@ -35,6 +35,9 @@ import java.util.Set; +import lombok.Data; +import lombok.EqualsAndHashCode; + import org.apache.log4j.Logger; import edu.ucdenver.ccp.datasource.fileparsers.Record; @@ -153,6 +156,8 @@ * @author Center for Computational Pharmacology; ccpsupport@ucdenver.edu * */ +@Data +@EqualsAndHashCode(callSuper = false) @Record(dataSource = DataSource.HGNC, schemaVersion = "2", comment = "Previous version of this record represented only a subset of the data in the HGNC download file. 
This version represents all data and includes the new \"gene family description\" column.", label = "HGNC record") public class HgncDownloadFileData extends SingleLineFileRecord { @@ -262,7 +267,7 @@ public class HgncDownloadFileData extends SingleLineFileRecord { @RecordField private final EntrezGeneID suppliedEntrezGeneId; @RecordField - private final OmimID suppliedOmimId; + private final Set suppliedOmimIds; @RecordField private final RefSeqID suppliedRefseqId; @RecordField @@ -270,6 +275,8 @@ public class HgncDownloadFileData extends SingleLineFileRecord { @RecordField private final EnsemblGeneID suppliedEnsemblId; @RecordField + private final VegaID suppliedVegaId; + @RecordField private final UcscGenomeBrowserId suppliedUcscId; @RecordField private final Set suppliedMgiIds; @@ -319,10 +326,10 @@ public class HgncDownloadFileData extends SingleLineFileRecord { * @param suppliedMgiId * @param suppliedRgdId */ - public HgncDownloadFileData(HgncID hgncID, HgncGeneSymbolID hgncGeneSymbol, String hgncGeneName, - String status, String locusType, String locusGroup, Set previousSymbols, - Set previousNames, Set synonyms, Set nameSynonyms, String chromosome, - String dateApproved, String dateModified, String dateSymbolChanged, String dateNameChanged, + public HgncDownloadFileData(HgncID hgncID, HgncGeneSymbolID hgncGeneSymbol, String hgncGeneName, String status, + String locusType, String locusGroup, Set previousSymbols, Set previousNames, + Set synonyms, Set nameSynonyms, String chromosome, String dateApproved, + String dateModified, String dateSymbolChanged, String dateNameChanged, Set> accessionNumbers, Set ecNumbers, EntrezGeneID entrezGeneID, EnsemblGeneID ensemblGeneID, Set mgiIDs, Set specialistDatabaseIdLinkPairings, Set pubmedIDs, @@ -330,9 +337,10 @@ public HgncDownloadFileData(HgncID hgncID, HgncGeneSymbolID hgncGeneSymbol, Stri String recordType, Set> primaryIds, Set> secondaryIds, Set ccdsIDs, Set vegaIDs, Set locusSpecificDatabaseNameLinkPairings, - 
EntrezGeneID suppliedEntrezGeneId, OmimID suppliedOmimId, RefSeqID suppliedRefseqId, - UniProtID suppliedUniprotId, EnsemblGeneID suppliedEnsemblId, UcscGenomeBrowserId suppliedUcscId, - Set suppliedMgiIds, Set suppliedRgdIds, long byteOffset, long lineNumber) { + EntrezGeneID suppliedEntrezGeneId, Set suppliedOmimIds, RefSeqID suppliedRefseqId, + UniProtID suppliedUniprotId, EnsemblGeneID suppliedEnsemblId, VegaID suppliedVegaId, + UcscGenomeBrowserId suppliedUcscId, Set suppliedMgiIds, Set suppliedRgdIds, + long byteOffset, long lineNumber) { super(byteOffset, lineNumber); this.hgncID = hgncID; this.hgncGeneSymbol = hgncGeneSymbol; @@ -365,281 +373,17 @@ public HgncDownloadFileData(HgncID hgncID, HgncGeneSymbolID hgncGeneSymbol, Stri this.vegaIDs = vegaIDs; this.locusSpecificDatabaseNameLinkPairings = locusSpecificDatabaseNameLinkPairings; this.suppliedEntrezGeneId = suppliedEntrezGeneId; - this.suppliedOmimId = suppliedOmimId; + this.suppliedOmimIds = suppliedOmimIds; this.suppliedRefseqId = suppliedRefseqId; this.suppliedUniprotId = suppliedUniprotId; this.suppliedEnsemblId = suppliedEnsemblId; + this.suppliedVegaId = suppliedVegaId; this.suppliedUcscId = suppliedUcscId; this.suppliedMgiIds = suppliedMgiIds; this.suppliedRgdIds = suppliedRgdIds; } - /** - * @return the hgncID - */ - public HgncID getHgncID() { - return hgncID; - } - - /** - * @return the hgncGeneSymbol - */ - public HgncGeneSymbolID getHgncGeneSymbol() { - return hgncGeneSymbol; - } - - /** - * @return the hgncGeneName - */ - public String getHgncGeneName() { - return hgncGeneName; - } - - /** - * @return the status - */ - public String getStatus() { - return status; - } - - /** - * @return the locusType - */ - public String getLocusType() { - return locusType; - } - - /** - * @return the locusGroup - */ - public String getLocusGroup() { - return locusGroup; - } - - /** - * @return the previousSymbols - */ - public Set getPreviousSymbols() { - return previousSymbols; - } - - /** - * @return the 
previousNames - */ - public Set getPreviousNames() { - return previousNames; - } - - /** - * @return the synonyms - */ - public Set getSynonyms() { - return synonyms; - } - - /** - * @return the nameSynonyms - */ - public Set getNameSynonyms() { - return nameSynonyms; - } - - /** - * @return the chromosome - */ - public String getChromosome() { - return chromosome; - } - - /** - * @return the dateApproved - */ - public String getDateApproved() { - return dateApproved; - } - - /** - * @return the dateModified - */ - public String getDateModified() { - return dateModified; - } - - /** - * @return the dateSymbolChanged - */ - public String getDateSymbolChanged() { - return dateSymbolChanged; - } - - /** - * @return the dateNameChanged - */ - public String getDateNameChanged() { - return dateNameChanged; - } - - /** - * @return the accessionNumbers - */ - public Set> getAccessionNumbers() { - return accessionNumbers; - } - - /** - * @return the ecNumbers - */ - public Set getEcNumbers() { - return ecNumbers; - } - - /** - * @return the entrezGeneID - */ - public EntrezGeneID getEntrezGeneID() { - return entrezGeneID; - } - - /** - * @return the ensemblGeneID - */ - public EnsemblGeneID getEnsemblGeneID() { - return ensemblGeneID; - } - - /** - * @return the mgiID - */ - public Set getMgiIDs() { - return mgiIDs; - } - - /** - * @return the specialistDatabaseIdLinkPairings - */ - public Set getSpecialistDatabaseIdLinkPairings() { - return specialistDatabaseIdLinkPairings; - } - - /** - * @return the pubmedIDs - */ - public Set getPubmedIDs() { - return pubmedIDs; - } - - /** - * @return the refseqIDs - */ - public Set getRefseqIDs() { - return refseqIDs; - } - - /** - * @return the geneFamilyTagDescriptionPairings - */ - public Set getGeneFamilyTagDescriptionPairings() { - return geneFamilyTagDescriptionPairings; - } - - /** - * @return the recordType - */ - public String getRecordType() { - return recordType; - } - - /** - * @return the primaryIds - */ - public Set> 
getPrimaryIds() { - return primaryIds; - } - - /** - * @return the secondaryIds - */ - public Set> getSecondaryIds() { - return secondaryIds; - } - - /** - * @return the ccdsIDs - */ - public Set getCcdsIDs() { - return ccdsIDs; - } - - /** - * @return the vegaIDs - */ - public Set getVegaIDs() { - return vegaIDs; - } - - /** - * @return the locusSpecificDatabaseNameLinkPairings - */ - public Set getLocusSpecificDatabaseNameLinkPairings() { - return locusSpecificDatabaseNameLinkPairings; - } - - /** - * @return the suppliedEntrezGeneId - */ - public EntrezGeneID getSuppliedEntrezGeneId() { - return suppliedEntrezGeneId; - } - - /** - * @return the suppliedOmimId - */ - public OmimID getSuppliedOmimId() { - return suppliedOmimId; - } - - /** - * @return the suppliedRefseqId - */ - public RefSeqID getSuppliedRefseqId() { - return suppliedRefseqId; - } - - /** - * @return the suppliedUniprotId - */ - public UniProtID getSuppliedUniprotId() { - return suppliedUniprotId; - } - - /** - * @return the suppliedEnsemblId - */ - public EnsemblGeneID getSuppliedEnsemblId() { - return suppliedEnsemblId; - } - - /** - * @return the suppliedUcscId - */ - public UcscGenomeBrowserId getSuppliedUcscId() { - return suppliedUcscId; - } - - /** - * @return the suppliedMgiId - */ - public Set getSuppliedMgiIds() { - return suppliedMgiIds; - } - - /** - * @return the suppliedRgdId - */ - public Set getSuppliedRgdId() { - return suppliedRgdIds; - } - + @Data @Record(dataSource = DataSource.HGNC) public static class SpecialistDbIdLinkPair { @RecordField @@ -656,75 +400,9 @@ public SpecialistDbIdLinkPair(DataSourceIdentifier specialistDbId, String spe this.specialistDbId = specialistDbId; this.specialistDbUrl = specialistDbUrl; } - - /** - * @return the specialistDbId - */ - public DataSourceIdentifier getSpecialistDbId() { - return specialistDbId; - } - - /** - * @return the specialistDbUrl - */ - public String getSpecialistDbUrl() { - return specialistDbUrl; - } - - /* - * (non-Javadoc) - 
* - * @see java.lang.Object#hashCode() - */ - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((specialistDbId == null) ? 0 : specialistDbId.hashCode()); - result = prime * result + ((specialistDbUrl == null) ? 0 : specialistDbUrl.hashCode()); - return result; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#equals(java.lang.Object) - */ - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - SpecialistDbIdLinkPair other = (SpecialistDbIdLinkPair) obj; - if (specialistDbId == null) { - if (other.specialistDbId != null) - return false; - } else if (!specialistDbId.equals(other.specialistDbId)) - return false; - if (specialistDbUrl == null) { - if (other.specialistDbUrl != null) - return false; - } else if (!specialistDbUrl.equals(other.specialistDbUrl)) - return false; - return true; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#toString() - */ - @Override - public String toString() { - return "SpecialistDbIdLinkPair [specialistDbId=" + specialistDbId + ", specialistDbUrl=" + specialistDbUrl - + "]"; - } - } + @Data @Record(dataSource = DataSource.HGNC) public static class GeneFamilyTagDescriptionPair { @RecordField @@ -742,74 +420,9 @@ public GeneFamilyTagDescriptionPair(String geneFamilyTag, String geneFamilyDescr this.geneFamilyDescription = geneFamilyDescription; } - /** - * @return the geneFamilyTag - */ - public String getGeneFamilyTag() { - return geneFamilyTag; - } - - /** - * @return the geneFamilyDescription - */ - public String getGeneFamilyDescription() { - return geneFamilyDescription; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#hashCode() - */ - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((geneFamilyDescription == null) ? 
0 : geneFamilyDescription.hashCode()); - result = prime * result + ((geneFamilyTag == null) ? 0 : geneFamilyTag.hashCode()); - return result; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#equals(java.lang.Object) - */ - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - GeneFamilyTagDescriptionPair other = (GeneFamilyTagDescriptionPair) obj; - if (geneFamilyDescription == null) { - if (other.geneFamilyDescription != null) - return false; - } else if (!geneFamilyDescription.equals(other.geneFamilyDescription)) - return false; - if (geneFamilyTag == null) { - if (other.geneFamilyTag != null) - return false; - } else if (!geneFamilyTag.equals(other.geneFamilyTag)) - return false; - return true; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#toString() - */ - @Override - public String toString() { - return "GeneFamilyTagDescriptionPair [geneFamilyTag=" + geneFamilyTag + ", geneFamilyDescription=" - + geneFamilyDescription + "]"; - } - } + @Data @Record(dataSource = DataSource.HGNC) public static class LocusSpecificDatabaseNameLinkPair { @RecordField @@ -827,71 +440,6 @@ public LocusSpecificDatabaseNameLinkPair(String databaseName, String link) { this.link = link; } - /** - * @return the databaseName - */ - public String getDatabaseName() { - return databaseName; - } - - /** - * @return the link - */ - public String getLink() { - return link; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#hashCode() - */ - @Override - public int hashCode() { - final int prime = 31; - int result = 1; - result = prime * result + ((databaseName == null) ? 0 : databaseName.hashCode()); - result = prime * result + ((link == null) ? 
0 : link.hashCode()); - return result; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#equals(java.lang.Object) - */ - @Override - public boolean equals(Object obj) { - if (this == obj) - return true; - if (obj == null) - return false; - if (getClass() != obj.getClass()) - return false; - LocusSpecificDatabaseNameLinkPair other = (LocusSpecificDatabaseNameLinkPair) obj; - if (databaseName == null) { - if (other.databaseName != null) - return false; - } else if (!databaseName.equals(other.databaseName)) - return false; - if (link == null) { - if (other.link != null) - return false; - } else if (!link.equals(other.link)) - return false; - return true; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Object#toString() - */ - @Override - public String toString() { - return "LocusSpecificDatabaseNameLinkPair [databaseName=" + databaseName + ", link=" + link + "]"; - } - } } diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java index 753a6ae..690b3af 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java @@ -108,7 +108,7 @@ public class HgncDownloadFileParser extends SingleLineFileRecordReader { private static final Logger logger = Logger.getLogger(HgncDownloadFileParser.class); - private static final String HEADER = "HGNC ID\tApproved Symbol\tApproved Name\tStatus\tLocus Type\tLocus Group\tPrevious Symbols\tPrevious Names\tSynonyms\tName Synonyms\tChromosome\tDate Approved\tDate Modified\tDate Symbol Changed\tDate Name Changed\tAccession Numbers\tEnzyme IDs\tEntrez Gene ID\tEnsembl Gene ID\tMouse Genome Database ID\tSpecialist Database Links\tSpecialist Database IDs\tPubmed IDs\tRefSeq IDs\tGene 
Family Tag\tGene family description\tRecord Type\tPrimary IDs\tSecondary IDs\tCCDS IDs\tVEGA IDs\tLocus Specific Databases\tEntrez Gene ID (supplied by NCBI)\tOMIM ID (supplied by NCBI)\tRefSeq (supplied by NCBI)\tUniProt ID (supplied by UniProt)\tEnsembl ID (supplied by Ensembl)\tUCSC ID (supplied by UCSC)\tMouse Genome Database ID (supplied by MGI)\tRat Genome Database ID (supplied by RGD)"; + private static final String HEADER = "HGNC ID\tApproved Symbol\tApproved Name\tStatus\tLocus Type\tLocus Group\tPrevious Symbols\tPrevious Names\tSynonyms\tName Synonyms\tChromosome\tDate Approved\tDate Modified\tDate Symbol Changed\tDate Name Changed\tAccession Numbers\tEnzyme IDs\tEntrez Gene ID\tEnsembl Gene ID\tMouse Genome Database ID\tSpecialist Database Links\tSpecialist Database IDs\tPubmed IDs\tRefSeq IDs\tGene Family Tag\tGene family description\tRecord Type\tPrimary IDs\tSecondary IDs\tCCDS IDs\tVEGA IDs\tLocus Specific Databases\tEntrez Gene ID (supplied by NCBI)\tOMIM ID (supplied by NCBI)\tRefSeq (supplied by NCBI)\tUniProt ID (supplied by UniProt)\tEnsembl ID (supplied by Ensembl)\tVega ID (supplied by Vega)\tUCSC ID (supplied by UCSC)\tMouse Genome Database ID (supplied by MGI)\tRat Genome Database ID (supplied by RGD)"; public enum WithdrawnRecordTreatment { IGNORE, INCLUDE @@ -153,7 +153,7 @@ protected String getExpectedFileHeader() throws IOException { @Override protected HgncDownloadFileData parseRecordFromLine(Line line) { String[] toks = line.getText().split("\\t", -1); - if (toks.length == 40) { + if (toks.length == 41) { int column = 0; HgncID hgncID = new HgncID(toks[column++]); HgncGeneSymbolID hgncGeneSymbol = new HgncGeneSymbolID(toks[column++]); @@ -326,13 +326,11 @@ protected HgncDownloadFileData parseRecordFromLine(Line line) { suppliedEntrezGeneId = new EntrezGeneID(columnValue); } - OmimID suppliedOmimId = null; + Set suppliedOmimIds = new HashSet(); columnValue = toks[column++]; if (!columnValue.isEmpty()) { - try { - suppliedOmimId = new 
OmimID(columnValue); - } catch (IllegalArgumentException iae) { - logger.warn(iae); + for (String tok : columnValue.split(",")) { + suppliedOmimIds.add(new OmimID(tok.trim())); } } @@ -354,6 +352,12 @@ protected HgncDownloadFileData parseRecordFromLine(Line line) { suppliedEnsemblId = new EnsemblGeneID(columnValue); } + VegaID suppliedVegaId = null; + columnValue = toks[column++]; + if (!columnValue.isEmpty()) { + suppliedVegaId = new VegaID(columnValue); + } + UcscGenomeBrowserId suppliedUcscId = null; columnValue = toks[column++]; if (!columnValue.isEmpty()) { @@ -381,11 +385,12 @@ protected HgncDownloadFileData parseRecordFromLine(Line line) { dateSymbolChanged, dateNameChanged, accessionNumbers, ecNumbers, entrezGeneId, ensemblGeneID, mgiIDs, specialistDatabaseLinks, pubmedIDs, refseqIDs, geneFamilyTagDescriptionPairs, recordType, primaryIds, secondaryIds, ccdsIds, vegaIds, locusSpecificDatabaseNameLinkPairs, - suppliedEntrezGeneId, suppliedOmimId, suppliedRefseqId, suppliedUniProtId, suppliedEnsemblId, - suppliedUcscId, suppliedMgiIds, suppliedRgdIds, line.getByteOffset(), line.getLineNumber()); + suppliedEntrezGeneId, suppliedOmimIds, suppliedRefseqId, suppliedUniProtId, suppliedEnsemblId, + suppliedVegaId, suppliedUcscId, suppliedMgiIds, suppliedRgdIds, line.getByteOffset(), + line.getLineNumber()); } - logger.error("Unexpected number of tokens (" + toks.length + "; expected 40) on line: " + logger.error("Unexpected number of tokens (" + toks.length + "; expected 41) on line: " + line.getText().replaceAll("\\t", " [TAB] ")); return null; @@ -564,7 +569,8 @@ private Set> resolveAccessionNumbers(String accListStr) Set> accNumbers = new HashSet>(); if (!accListStr.isEmpty()) { for (String acc : accListStr.split(",")) { - DataSourceIdentifier nucAccId = NucleotideAccessionResolver.resolveNucleotideAccession(acc, acc); + DataSourceIdentifier nucAccId = NucleotideAccessionResolver + .resolveNucleotideAccession(acc, acc); if 
(ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId)) { DataSourceIdentifier proAccId = ProteinAccessionResolver.resolveProteinAccession(acc, acc); accNumbers.add(proAccId); diff --git a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java index e96d636..ff28862 100644 --- a/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java +++ b/datasource-fileparsers/src/test/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParserTest.java @@ -41,6 +41,7 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; @@ -62,6 +63,7 @@ import edu.ucdenver.ccp.datasource.identifiers.mgi.MgiGeneID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.CcdsId; import edu.ucdenver.ccp.datasource.identifiers.ncbi.gene.EntrezGeneID; +import edu.ucdenver.ccp.datasource.identifiers.ncbi.omim.OmimID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.refseq.RefSeqID; import edu.ucdenver.ccp.datasource.identifiers.other.CosmicId; import edu.ucdenver.ccp.datasource.identifiers.other.UcscGenomeBrowserId; @@ -127,13 +129,13 @@ public void testParser() throws Exception { assertEmpty(dataRecord.getVegaIDs()); assertEmpty(dataRecord.getLocusSpecificDatabaseNameLinkPairings()); assertEquals(new EntrezGeneID(503538), dataRecord.getSuppliedEntrezGeneId()); - assertNull(dataRecord.getSuppliedOmimId()); + assertEmpty(dataRecord.getSuppliedOmimIds()); assertEquals(new RefSeqID("NR_015380"), dataRecord.getSuppliedRefseqId()); assertNull(dataRecord.getSuppliedUniprotId()); assertNull(dataRecord.getSuppliedEnsemblId()); assertEquals(new UcscGenomeBrowserId("uc002qsg.3"), dataRecord.getSuppliedUcscId()); assertEmpty(dataRecord.getSuppliedMgiIds()); - 
assertEmpty(dataRecord.getSuppliedRgdId()); + assertEmpty(dataRecord.getSuppliedRgdIds()); } else { fail("Parser should have returned the first record."); } @@ -184,13 +186,13 @@ public void testParser() throws Exception { "ALSOD, the Amyotrophic Lateral Sclerosis Online Genetic Database", "http://alsod.iop.kcl.ac.uk/")), dataRecord.getLocusSpecificDatabaseNameLinkPairings()); assertEquals(new EntrezGeneID(29974), dataRecord.getSuppliedEntrezGeneId()); - assertNull(dataRecord.getSuppliedOmimId()); + assertEmpty(dataRecord.getSuppliedOmimIds()); assertEquals(new RefSeqID("NM_001198818"), dataRecord.getSuppliedRefseqId()); assertEquals(new UniProtID("Q9NQ94"), dataRecord.getSuppliedUniprotId()); assertEquals(new EnsemblGeneID("ENSG00000148584"), dataRecord.getSuppliedEnsemblId()); assertEquals(new UcscGenomeBrowserId("uc001jjj.3"), dataRecord.getSuppliedUcscId()); assertEquals(CollectionsUtil.createSet(new MgiGeneID("MGI:1917115")), dataRecord.getSuppliedMgiIds()); - assertEquals(CollectionsUtil.createSet(new RgdID("619834")), dataRecord.getSuppliedRgdId()); + assertEquals(CollectionsUtil.createSet(new RgdID("619834")), dataRecord.getSuppliedRgdIds()); } else { fail("Parser should have returned the first record."); } diff --git a/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/hgnc/hgnc_download.txt b/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/hgnc/hgnc_download.txt index 4217a78..94b5606 100644 --- a/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/hgnc/hgnc_download.txt +++ b/datasource-fileparsers/src/test/resources/edu/ucdenver/ccp/datasource/fileparsers/hgnc/hgnc_download.txt @@ -1,3 +1,3 @@ -HGNC ID Approved Symbol Approved Name Status Locus Type Locus Group Previous Symbols Previous Names Synonyms Name Synonyms Chromosome Date Approved Date Modified Date Symbol Changed Date Name Changed Accession Numbers Enzyme IDs Entrez Gene ID Ensembl Gene ID Mouse 
Genome Database ID Specialist Database Links Specialist Database IDs Pubmed IDs RefSeq IDs Gene Family Tag Gene family description Record Type Primary IDs Secondary IDs CCDS IDs VEGA IDs Locus Specific Databases Entrez Gene ID (supplied by NCBI) OMIM ID (supplied by NCBI) RefSeq (supplied by NCBI) UniProt ID (supplied by UniProt) Ensembl ID (supplied by Ensembl) UCSC ID (supplied by UCSC) Mouse Genome Database ID (supplied by MGI) Rat Genome Database ID (supplied by RGD) -HGNC:37133 A1BG-AS1 A1BG antisense RNA 1 Approved RNA, long non-coding non-coding RNA NCRNA00181, A1BGAS, A1BG-AS "non-protein coding RNA 181", "A1BG antisense RNA (non-protein coding)", "A1BG antisense RNA 1 (non-protein coding)" FLJ23569 19q13.4 2009-07-20 2012-10-12 2010-11-25 2012-08-15 BC040926 503538 , , , , , , , , , , , , , , , , NR_015380 LNCRNA, ANTISENSE "-", "ncRNAs / Long non-coding RNAs, antisense" Standard 503538 NR_015380 uc002qsg.3 -HGNC:24086 A1CF APOBEC1 complementation factor Approved gene with protein product protein-coding gene ACF, ASP, ACF64, ACF65, APOBEC1CF 10q21.1 2007-11-23 2011-07-21 AF271790 29974 ENSG00000148584 MGI:1917115 COSMIC , , , , , , , , , , A1CF, , , , , , 11815617, 11072063 NM_014576 Standard CCDS7241.1, CCDS7242.1, CCDS7243.1 OTTHUMG00000018240 Androgen Receptor|http://androgendb.mcgill.ca/,Mental Retardation database|http://grenada.lumc.nl/LOVD2/MR/home.php?select_db=AR,ALSOD, the Amyotrophic Lateral Sclerosis Online Genetic Database|http://alsod.iop.kcl.ac.uk/ 29974 NM_001198818 Q9NQ94 ENSG00000148584 uc001jjj.3 MGI:1917115 RGD:619834 +HGNC ID Approved Symbol Approved Name Status Locus Type Locus Group Previous Symbols Previous Names Synonyms Name Synonyms Chromosome Date Approved Date Modified Date Symbol Changed Date Name Changed Accession Numbers Enzyme IDs Entrez Gene ID Ensembl Gene ID Mouse Genome Database ID Specialist Database Links Specialist Database IDs Pubmed IDs RefSeq IDs Gene Family Tag Gene family description Record Type Primary IDs 
Secondary IDs CCDS IDs VEGA IDs Locus Specific Databases Entrez Gene ID (supplied by NCBI) OMIM ID (supplied by NCBI) RefSeq (supplied by NCBI) UniProt ID (supplied by UniProt) Ensembl ID (supplied by Ensembl) Vega ID (supplied by Vega) UCSC ID (supplied by UCSC) Mouse Genome Database ID (supplied by MGI) Rat Genome Database ID (supplied by RGD) +HGNC:37133 A1BG-AS1 A1BG antisense RNA 1 Approved RNA, long non-coding non-coding RNA NCRNA00181, A1BGAS, A1BG-AS "non-protein coding RNA 181", "A1BG antisense RNA (non-protein coding)", "A1BG antisense RNA 1 (non-protein coding)" FLJ23569 19q13.4 2009-07-20 2012-10-12 2010-11-25 2012-08-15 BC040926 503538 , , , , , , , , , , , , , , , , NR_015380 LNCRNA, ANTISENSE "-", "ncRNAs / Long non-coding RNAs, antisense" Standard 503538 NR_015380 uc002qsg.3 +HGNC:24086 A1CF APOBEC1 complementation factor Approved gene with protein product protein-coding gene ACF, ASP, ACF64, ACF65, APOBEC1CF 10q21.1 2007-11-23 2011-07-21 AF271790 29974 ENSG00000148584 MGI:1917115 COSMIC , , , , , , , , , , A1CF, , , , , , 11815617, 11072063 NM_014576 Standard CCDS7241.1, CCDS7242.1, CCDS7243.1 OTTHUMG00000018240 Androgen Receptor|http://androgendb.mcgill.ca/,Mental Retardation database|http://grenada.lumc.nl/LOVD2/MR/home.php?select_db=AR,ALSOD, the Amyotrophic Lateral Sclerosis Online Genetic Database|http://alsod.iop.kcl.ac.uk/ 29974 NM_001198818 Q9NQ94 ENSG00000148584 uc001jjj.3 MGI:1917115 RGD:619834 From 1df77efcfdb493c0153e2a11e6f9c48a67043764 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 12:48:20 -0700 Subject: [PATCH 29/36] Added handling for Vega ID resolution --- .../ccp/datasource/identifiers/DataSourceIdResolver.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java index 08de587..6b0fddd 100644 --- 
a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSourceIdResolver.java @@ -446,6 +446,8 @@ else if (geneIDStr.matches("rs\\d+")) return new SnpRsId(geneIDStr); else if (geneIDStr.startsWith("CL:")) return new CellTypeOntologyID(geneIDStr); + else if (geneIDStr.startsWith("Vega:")) + return new VegaID(StringUtil.removePrefix(geneIDStr, "Vega:")); else if (geneIDStr.startsWith("NCBITaxon:")) return new NcbiTaxonomyID(StringUtil.removePrefix(geneIDStr, "NCBITaxon:")); From d34c2eebfb4a3c4cd5b23deb4d3b03fd30b6cdd1 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 13:19:27 -0700 Subject: [PATCH 30/36] Added handling for multiple mesh IDs for a given category --- .../fileparsers/drugbank/DrugBankDrugRecord.java | 16 ++++++++++++---- .../ccp/datasource/identifiers/ncbi/MeshID.java | 6 ++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java index 9c0498b..eb3c385 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugBankDrugRecord.java @@ -932,11 +932,17 @@ private Set initCategories(CategoryListType list) { } Set toReturn = new HashSet(); for (CategoryType p : list.getCategory()) { - MeshID meshId = null; + Set meshIds = new HashSet(); if (!p.getMeshId().trim().isEmpty()) { - meshId = new MeshID(p.getMeshId().trim()); + String meshStr = p.getMeshId().trim(); + meshStr = meshStr.replaceAll("\"", ""); + meshStr = meshStr.replace("[", ""); + meshStr = meshStr.replace("]", ""); + for (String tok : meshStr.split(",")) { + 
meshIds.add(new MeshID(tok)); + } } - Category c = new Category(meshId, p.getCategory()); + Category c = new Category(meshIds, p.getCategory()); toReturn.add(c); } return toReturn; @@ -946,7 +952,7 @@ private Set initCategories(CategoryListType list) { @Record(dataSource = DataSource.DRUGBANK) private static class Category { @RecordField - private final MeshID meshId; + private final Set meshIds; @RecordField private final String category; } @@ -1308,6 +1314,8 @@ private static DataSourceIdentifier resolveIdentifier(String resource, String "GenBank:" + identifier); if (ProbableErrorDataSourceIdentifier.class.isInstance(nucAccId.getClass())) { return ProteinAccessionResolver.resolveProteinAccession(identifier, "GenBank:" + identifier); + } else { + return nucAccId; } } else if (resource.equals("UniProtKB")) { return new UniProtID(identifier); diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ncbi/MeshID.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ncbi/MeshID.java index b6dbb98..9c735b5 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ncbi/MeshID.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/ncbi/MeshID.java @@ -33,10 +33,8 @@ * #L% */ -import org.apache.commons.lang.math.NumberUtils; - -import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; import edu.ucdenver.ccp.datasource.identifiers.DataSource; +import edu.ucdenver.ccp.datasource.identifiers.DataSourceIdentifier; /** * ID for Medical Subject Heading definition as described by www.nlm.nih.gov/mesh @@ -57,7 +55,7 @@ public MeshID(String resourceID) { @Override public String validate(String resourceID) throws IllegalArgumentException { - if (resourceID != null && resourceID.matches("[A-Z]\\d+")) + if (resourceID != null) return resourceID; throw new IllegalArgumentException(String.format("Invalid Mesh ID : %s", resourceID)); From 
2a68c66186bd9ae8285bb4bdeb81fcb265aae5f7 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 14:40:14 -0700 Subject: [PATCH 31/36] Added handling for RNACentral identifiers --- .../GpAssociationGoaUniprotFileParser.java | 4 ++ .../datasource/identifiers/DataSource.java | 1 + .../identifiers/other/RnaCentralId.java | 51 +++++++++++++++++++ 3 files changed, 56 insertions(+) create mode 100644 datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/other/RnaCentralId.java diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/goa/GpAssociationGoaUniprotFileParser.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/goa/GpAssociationGoaUniprotFileParser.java index a38fb15..a8687c6 100755 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/goa/GpAssociationGoaUniprotFileParser.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/goa/GpAssociationGoaUniprotFileParser.java @@ -63,6 +63,7 @@ import edu.ucdenver.ccp.datasource.identifiers.ebi.uniprot.UniProtIsoformID; import edu.ucdenver.ccp.datasource.identifiers.ncbi.taxonomy.NcbiTaxonomyID; import edu.ucdenver.ccp.datasource.identifiers.obo.GeneOntologyID; +import edu.ucdenver.ccp.datasource.identifiers.other.RnaCentralId; import edu.ucdenver.ccp.datasource.identifiers.reactome.ReactomeReactionID; import edu.ucdenver.ccp.identifier.publication.DOI; import edu.ucdenver.ccp.identifier.publication.PubMedID; @@ -317,6 +318,9 @@ private static DataSourceIdentifier createDatabaseObjectID(String database, S if (database.equals("IntAct")) { return new IntActID(databaseObjectIDStr); } + if (database.equals("RNAcentral")) { + return new RnaCentralId(databaseObjectIDStr); + } } catch (IllegalArgumentException e) { logger.warn(e.getMessage()); } diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java 
b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java index a214cd7..c9398ce 100644 --- a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/DataSource.java @@ -341,6 +341,7 @@ public String getLocalName() { PROTONET("http://www.protonet.cs.huji.ac.il/"), REBASE("http://rebase.neb.com/"), REPRODUCTION_2DPAGE("http://reprod.njmu.edu.cn/"), + RNACENTRAL("http://rnacentral.org/rna/"), ROUGE("http://www.kazusa.or.jp/rouge/"), SABIO_RK("http://sabiork.h-its.org/"), SBKB("http://sbkb.org/"), diff --git a/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/other/RnaCentralId.java b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/other/RnaCentralId.java new file mode 100644 index 0000000..8d17c58 --- /dev/null +++ b/datasource-identifiers/src/main/java/edu/ucdenver/ccp/datasource/identifiers/other/RnaCentralId.java @@ -0,0 +1,51 @@ +package edu.ucdenver.ccp.datasource.identifiers.other; + +/* + * #%L + * Colorado Computational Pharmacology's common module + * %% + * Copyright (C) 2012 - 2014 Regents of the University of Colorado + * %% + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the Regents of the University of Colorado nor the names of its contributors + * may be used to endorse or promote products derived from this software without + * specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE + * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * #L% + */ + +import edu.ucdenver.ccp.datasource.identifiers.DataSource; +import edu.ucdenver.ccp.datasource.identifiers.StringDataSourceIdentifier; + +/** + * Identifier for an entry in RNAcentral (http://rnacentral.org/) + * + * @author Colorado Computational Pharmacology, UC Denver; ccpsupport@ucdenver.edu + * + */ +public class RnaCentralId extends StringDataSourceIdentifier { + + public RnaCentralId(String resourceID) { + super(resourceID, DataSource.RNACENTRAL); +} + +} From cb1faa9483e6eb0c4895b07982f8366f40706401 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Tue, 9 Feb 2016 14:59:46 -0700 Subject: [PATCH 32/36] removed a print statement --- .../ebi/uniprot/SparseUniProtXmlFileRecordReader.java | 1 - 1 file changed, 1 deletion(-) diff --git a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/SparseUniProtXmlFileRecordReader.java b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/SparseUniProtXmlFileRecordReader.java index ff3fc07..05b99ab 100644 --- a/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/SparseUniProtXmlFileRecordReader.java +++ b/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/SparseUniProtXmlFileRecordReader.java @@ -97,7 +97,6 @@ protected boolean hasTaxonOfInterest(SparseUniProtFileRecord record) { } for (DbReference dbRef : record.getOrganism().getDbReference()) { if (getTaxonsOfInterest().contains(dbRef.getId())) { - System.out.println("has taxon of interest: " + dbRef.getId()); return true; } } From b4404d9b982f7bca74f18df1e767c74167b97e34 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Thu, 11 Feb 2016 08:37:43 -0700 Subject: [PATCH 33/36] updating poms for 0.6.1 branch with snapshot versions --- datasource-fileparsers/pom.xml | 2 +- datasource-identifiers/pom.xml | 2 +- datasource-rdfizer/pom.xml | 2 +- pom.xml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git 
a/datasource-fileparsers/pom.xml b/datasource-fileparsers/pom.xml index 55a633d..8cffda9 100644 --- a/datasource-fileparsers/pom.xml +++ b/datasource-fileparsers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.7-SNAPSHOT + 0.6.1-SNAPSHOT datasource-fileparsers diff --git a/datasource-identifiers/pom.xml b/datasource-identifiers/pom.xml index ef3917f..c12811c 100644 --- a/datasource-identifiers/pom.xml +++ b/datasource-identifiers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.7-SNAPSHOT + 0.6.1-SNAPSHOT datasource-identifiers diff --git a/datasource-rdfizer/pom.xml b/datasource-rdfizer/pom.xml index 1e4b68f..728b501 100644 --- a/datasource-rdfizer/pom.xml +++ b/datasource-rdfizer/pom.xml @@ -3,7 +3,7 @@ edu.ucdenver.ccp datasource - 0.7-SNAPSHOT + 0.6.1-SNAPSHOT datasource-rdfizer diff --git a/pom.xml b/pom.xml index dd3e590..76a1578 100644 --- a/pom.xml +++ b/pom.xml @@ -2,7 +2,7 @@ 4.0.0 edu.ucdenver.ccp datasource - 0.7-SNAPSHOT + 0.6.1-SNAPSHOT pom From 1d0533c9a9975224b738d3ffdaf5ba45b1858cbf Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Thu, 11 Feb 2016 14:28:55 -0700 Subject: [PATCH 34/36] Overhaul of README now lists most of the available parsers and has improved documentation for RDF generation --- README.md | 226 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 136 insertions(+), 90 deletions(-) diff --git a/README.md b/README.md index 46c6654..4f4607f 100644 --- a/README.md +++ b/README.md @@ -1,27 +1,28 @@ -# datasource -A library of code for parsing (mostly biomedical) data source files and converting their contents to RDF +A library of code for parsing (mostly biomedical) data source files -This library contains file parsers for files from many different biomedical databases. It also contains -code that uses a file parser as input and outputs RDF. 
The structure of the RDF is described in: -``` -KaBOB: Ontology-Based Semantic Integration of Biomedical Databases -Kevin M Livingston, Michael Bada, William A Baumgartner, Lawrence E Hunter -BMC Bioinformatics (accepted) -``` - -## Development -This project follows the Git-Flow approach to branching as originally described [here](http://nvie.com/posts/a-successful-git-branching-model/). -To facilitate the Git-Flow branching approach, this project makes use of the [jgitflow-maven-plugin](https://bitbucket.org/atlassian/jgit-flow) as described [here](http://george-stathis.com/2013/11/09/painless-maven-project-releases-with-maven-gitflow-plugin/). - -Code in the [master branch](https://github.com/UCDenver-ccp/datasource/tree/master) reflects the latest release of this library. Code in the [development](https://github.com/UCDenver-ccp/datasource/tree/development) branch contains the most up-to-date version of this project. +# Prerequisites + * [Java](https://www.oracle.com/java/index.html), at least version 8, is required. + * [Apache Maven](https://maven.apache.org/) is required to build the project. + * If you +intend to build this project inside of an IDE, such as Eclipse, please see the instructions +for using the [Lombok](https://projectlombok.org/) library with your IDE [here](https://projectlombok.org/features/index.html). +# Installation +To use the scripts included in this project, e.g. to generate an RDF representation for a given datasource from the command line, you must download and install the project: +``` +$ git clone https://github.com/UCDenver-ccp/datasource datasource.git +$ cd datasource.git +$ mvn clean install +``` +Scripts must be run from the project's base directory. 
-## Maven signature if only using the file parser API +If you are interested in programmatic access to the file parsers and related code, the libraries are available as Maven artifacts: +#### Maven signature if only using the file parser API ```xml edu.ucdenver.ccp datasource-fileparsers - 0.6 + 0.6.1 @@ -30,12 +31,12 @@ Code in the [master branch](https://github.com/UCDenver-ccp/datasource/tree/mast ``` -## Maven signature if interested in generating RDF of parsed file content +#### Maven signature if interested in generating RDF of parsed file content ```xml edu.ucdenver.ccp datasource-rdfizer - 0.6 + 0.6.1 @@ -44,75 +45,116 @@ Code in the [master branch](https://github.com/UCDenver-ccp/datasource/tree/mast ``` -## Bulk RDF Generation +# Development +This project follows the Git-Flow approach to branching as originally described [here](http://nvie.com/posts/a-successful-git-branching-model/). +To facilitate the Git-Flow branching approach, this project makes use of the [jgitflow-maven-plugin](https://bitbucket.org/atlassian/jgit-flow) as described [here](http://george-stathis.com/2013/11/09/painless-maven-project-releases-with-maven-gitflow-plugin/). -This library has been built to work easily with distributed resource management -systems such as Oracle Grid Engine or Torque. This simply means that there is a -script to download and process (generate RDF triples) the data for a source: +Code in the [master branch](https://github.com/UCDenver-ccp/datasource/tree/master) reflects the latest release (v0.6.1) of this library. Code in the [development](https://github.com/UCDenver-ccp/datasource/tree/development) branch contains the most up-to-date version of this project. + +# Available file parsers +This library contains file parsers for files from many different biomedical databases. The table below lists the datasources, files, and relevant file parser class. +Many of the file parsers are capable of automatically downloading the file that they parse. 
Those files that cannot be downloaded automatically typically require registration, login, or a user-specific license. +The "Download" column is used to indicate which files cannot be downloaded automatically. This list is not guaranteed to be exhaustive. + +| Data source | File | Parser class | RDF Generation Key | Download | +|---|---|---|---|---| +| [DIP](http://dip.doe-mbi.ucla.edu/dip/Main.cgi) | dip{DATE}.txt.gz | [DipYYYYMMDDFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/dip/DipYYYYMMDDFileParser.java) | | MANUAL | +| [DrugBank](http://www.drugbank.ca/) | [drugbank.xml](http://www.drugbank.ca/downloads) | [DrugbankXmlFileRecordReader](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/drugbank/DrugbankXmlFileRecordReader.java) | DRUGBANK | AUTO | +| [Gene Ontology](http://geneontology.org/) | [annotation files](http://geneontology.org/page/download-annotations) | [GeneAssociationFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/geneontology/GeneAssociationFileParser.java) | | AUTO | +| [GOA](http://www.ebi.ac.uk/GOA) | [gp_association.goa_uniprot.gz](http://www.ebi.ac.uk/GOA/downloads) | [GpAssociationGoaUniprotFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/goa/GpAssociationGoaUniprotFileParser.java) | GOA | AUTO | +| [HGNC](http://www.genenames.org/) | [hgnc_complete_set.txt.gz](http://www.genenames.org/cgi-bin/statistics) | [HgncDownloadFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/hgnc/HgncDownloadFileParser.java) | HGNC | AUTO | +| [InterPro](http://www.ebi.ac.uk/interpro/) | 
[interpro2go](ftp://ftp.ebi.ac.uk/pub/databases/interpro/) | [InterPro2GoFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterPro2GoFileParser.java) | INTERPRO_INTERPRO2GO | AUTO | +| [InterPro](http://www.ebi.ac.uk/interpro/) | [names.dat](ftp://ftp.ebi.ac.uk/pub/databases/interpro/) | [InterProNamesDatFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProNamesDatFileParser.java) | INTERPRO_NAMESDAT | AUTO | +| [InterPro](http://www.ebi.ac.uk/interpro/) | [protein2ipr.dat.gz](ftp://ftp.ebi.ac.uk/pub/databases/interpro/) | [InterProProtein2IprDatFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/interpro/InterProProtein2IprDatFileParser.java) | INTERPRO_PROTEIN2IPR | AUTO | +| [IRefWeb](http://wodaklab.org/iRefWeb/) | [All.mitab.{DATE}.txt.zip](http://irefindex.org/download/irefindex/data/archive/release_14.0/psi_mitab/MITAB2.6/) | [IRefWebPsiMitab2_6FileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/irefweb/IRefWebPsiMitab2_6FileParser.java) | IREFWEB | AUTO | +| [MGI](http://www.informatics.jax.org/) | [MGI_EntrezGene.rpt](ftp://ftp.informatics.jax.org/pub/reports/index.html) | [MGIEntrezGeneFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MGIEntrezGeneFileParser.java) | MGI_ENTREZGENE | AUTO | +| [MGI](http://www.informatics.jax.org/) | [MGI_Geno_Disease.rpt](ftp://ftp.informatics.jax.org/pub/reports/index.html) | 
[MGIGenoDiseaseFileRecordReader](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MGIGenoDiseaseFileRecordReader.java) | |AUTO | +| [MGI](http://www.informatics.jax.org/) | [MGI_PhenoGenoMP.rpt](ftp://ftp.informatics.jax.org/pub/reports/index.html) | [MGIPhenoGenoMPFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MGIPhenoGenoMPFileParser.java) | MGI_MGIPHENOGENO | AUTO | +| [MGI](http://www.informatics.jax.org/) | [MRK_List2.rpt](ftp://ftp.informatics.jax.org/pub/reports/index.html) | [MRKListFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKListFileParser.java) | MGI_MRKLIST | AUTO | +| [MGI](http://www.informatics.jax.org/) | [MRK_Reference.rpt](ftp://ftp.informatics.jax.org/pub/reports/index.html) | [MRKReferenceFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKReferenceFileParser.java) | MGI_MRKREFERENCE | AUTO | +| [MGI](http://www.informatics.jax.org/) | [MRK_Sequence.rpt](ftp://ftp.informatics.jax.org/pub/reports/index.html) | [MRKSequenceFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSequenceFileParser.java) | MGI_MRKSEQUENCE | AUTO | +| [MGI](http://www.informatics.jax.org/) | [MRK_SwissProt.rpt](ftp://ftp.informatics.jax.org/pub/reports/index.html) | [MRKSwissProtFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mgi/MRKSwissProtFileParser.java) | MGI_MRKSWISSPROT | AUTO | +| [miRBase](http://www.mirbase.org/) | [miRNA.dat.gz](http://www.mirbase.org/ftp.shtml) | 
[MirBaseMiRnaDatFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/mirbase/MirBaseMiRnaDatFileParser.java) | MIRBASE | AUTO | +| [NCBI Gene](http://www.ncbi.nlm.nih.gov/gene) | [gene2accession.gz](ftp://ftp.ncbi.nih.gov/gene/DATA/) | [EntrezGene2AccessionFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGene2AccessionFileParser.java) | | AUTO | +| [NCBI Gene](http://www.ncbi.nlm.nih.gov/gene) | [gene2pubmed.gz](ftp://ftp.ncbi.nih.gov/gene/DATA/) | [EntrezGene2PubmedFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGene2PubmedFileParser.java) | | AUTO | +| [NCBI Gene](http://www.ncbi.nlm.nih.gov/gene) | [gene2refseq.gz](ftp://ftp.ncbi.nih.gov/gene/DATA/) | [EntrezGene2RefseqFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGene2RefseqFileParser.java) | NCBIGENE_GENE2REFSEQ | AUTO | +| [NCBI Gene](http://www.ncbi.nlm.nih.gov/gene) | [gene_info.gz](ftp://ftp.ncbi.nih.gov/gene/DATA/) | [EntrezGeneInfoFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneInfoFileParser.java) | NCBIGENE_GENEINFO | AUTO | +| [NCBI Gene](http://www.ncbi.nlm.nih.gov/gene) | [mim2gene_medgen](ftp://ftp.ncbi.nih.gov/gene/DATA/) | [EntrezGeneMim2GeneFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneMim2GeneFileParser.java) | NCBIGENE_MIM2GENE | AUTO | +| [NCBI Gene](http://www.ncbi.nlm.nih.gov/gene) | 
[gene_refseq_uniprotkb_collab.gz](ftp://ftp.ncbi.nih.gov/gene/DATA/) | [EntrezGeneRefSeqUniprotKbCollabFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/gene/EntrezGeneRefSeqUniprotKbCollabFileParser.java) | NCBIGENE_REFSEQUNIPROTCOLLAB | AUTO | +| [NCBI Homologene](http://www.ncbi.nlm.nih.gov/homologene) | [homologene.data](ftp://ftp.ncbi.nih.gov/pub/HomoloGene/current) | [HomoloGeneDataFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/homologene/HomoloGeneDataFileParser.java) | HOMOLOGENE | AUTO | +| [NCBI RefSeq](http://www.ncbi.nlm.nih.gov/refseq/) | [RefSeq-release{##}.catalog.gz](ftp://ftp.ncbi.nlm.nih.gov/refseq/release/release-catalog/) | [RefSeqReleaseCatalogFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ncbi/refseq/RefSeqReleaseCatalogFileParser.java) | REFSEQ_RELEASECATALOG | AUTO | +| [PharmGKB](https://www.pharmgkb.org/) | [diseases.tsv](https://www.pharmgkb.org/downloads/) | [PharmGkbDiseaseFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbDiseaseFileParser.java) | PHARMGKB_DISEASE | AUTO | +| [PharmGKB](https://www.pharmgkb.org/) | [drugs.tsv](https://www.pharmgkb.org/downloads/) | [PharmGkbDrugFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbDrugFileParser.java) | PHARMGKB_DRUG | AUTO | +| [PharmGKB](https://www.pharmgkb.org/) | [genes.tsv](https://www.pharmgkb.org/downloads/) | 
[PharmGkbGeneFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbGeneFileParser.java) | PHARMGKB_GENE | AUTO | +| [PharmGKB](https://www.pharmgkb.org/) | relations.tsv | [PharmGkbRelationFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pharmgkb/PharmGkbRelationFileParser.java) | PHARMGKB_RELATION | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Acetylation_site_dataset.gz | [AcetylationPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/AcetylationPhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Disease-associated_sites.gz | [DiseasePhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/DiseasePhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Kinase_Substrate_Dataset.gz | [KinasePhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/KinasePhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Methylation_site_dataset.gz | [MethylationPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/MethylationPhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | O-GalNAc_site_dataset.gz | 
[OGalNAcPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/OGalNAcPhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | O-GlcNAc_site_dataset.gz | [OGlcNAcPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/OGlcNAcPhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Phosphorylation_site_dataset.gz | [PhosphorylationPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/PhosphorylationPhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Regulatory_sites.gz | [RegulatoryPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/RegulatoryPhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Sumoylation_site_dataset.gz | [SumoylationPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/SumoylationPhosphositeFileParser.java) | | MANUAL | +| [PhosphoSite](http://www.phosphosite.org/homeAction.action) | Ubiquitination_site_dataset.gz | [UbiquitinationPhosphositeFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/phosphosite/UbiquitinationPhosphositeFileParser.java) | | MANUAL | +| [PreMod](http://genomequebec.mcgill.ca/PReMod/) | [human_module_tab.txt.gz](http://genomequebec.mcgill.ca/PReMod/download) | 
[HumanPReModModuleTabFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/premod/HumanPReModModuleTabFileParser.java) | PREMOD_HUMAN | AUTO | +| [PreMod](http://genomequebec.mcgill.ca/PReMod/) | [mouse_module_tab.txt.gz](http://genomequebec.mcgill.ca/PReMod/download) | [MousePReModModuleTabFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/premod/MousePReModModuleTabFileParser.java) | PREMOD_MOUSE | AUTO | +| [Protein Ontology](http://pir.georgetown.edu/pro/) | [promapping.txt](ftp://ftp.pir.georgetown.edu/databases/ontology/pro_obo/PRO_mappings/) | [ProMappingFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/pro/ProMappingFileParser.java) | PR_MAPPINGFILE | AUTO | +| [Reactome](http://www.reactome.org/) | [UniProt2Reactome.txt](http://www.reactome.org/pages/download-data/) | [ReactomeUniprot2PathwayStidTxtFileParser](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/reactome/ReactomeUniprot2PathwayStidTxtFileParser.java) | REACTOME_UNIPROT2PATHWAYSTID | AUTO | +| [RGD](http://rgd.mcw.edu/) | [GENES_RAT.txt](ftp://ftp.rgd.mcw.edu/pub/data_release/) | [RgdRatGeneFileRecordReader](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/rgd/RgdRatGeneFileRecordReader.java) | RGD_GENES | AUTO | +| [UniProt](http://www.uniprot.org/) | [uniprot_sprot.xml.gz](http://www.uniprot.org/downloads) | [SwissProtXmlFileRecordReader](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/SwissProtXmlFileRecordReader.java) | UNIPROT_SWISSPROT | AUTO | +| 
[UniProt](http://www.uniprot.org/) | [uniprot_trembl.xml.gz](http://www.uniprot.org/downloads) | [TremblXmlFileRecordReader](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/TremblXmlFileRecordReader.java) | UNIPROT_TREMBL_SPARSE | AUTO | +| [UniProt](http://www.uniprot.org/) | [idmapping_selected.tab.gz](http://www.uniprot.org/downloads) | [UniProtIDMappingFileRecordReader](https://github.com/UCDenver-ccp/datasource/blob/master/datasource-fileparsers/src/main/java/edu/ucdenver/ccp/datasource/fileparsers/ebi/uniprot/UniProtIDMappingFileRecordReader.java) | UNIPROT_IDMAPPING | AUTO | + + + +# Generating RDF representations of parsed database files +This library also contains code that can convert file parser output into a structured database record/field representation using RDF. + +The structure of the RDF is described in: ``` -datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh -``` - -#### Integer-to-File mappings +KaBOB: Ontology-Based Semantic Integration of Biomedical Databases +Kevin M Livingston, Michael Bada, William A Baumgartner, Lawrence E Hunter +BMC Bioinformatics (accepted) +``` +And the generated RDF serves as a foundation for the [KaBOB Knowledge Base of Biology](https://github.com/UCDenver-ccp/kabob). +Detailed instructions on how to generate RDF to feed into KaBOB can be found below and [here](https://github.com/UCDenver-ccp/kabob/wiki/Building-a-Knowledgebase-instance). -To see the integer-to-file mappings, run: -``` -datasource-rdfizer/scripts/list-download-file-indices.sh +The following script can be used to generate RDF representation for a given data source file: ``` +datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh -Note that due to licensing issues, some files are not available for download -directly. The resources denoted in italics below must be manually obtained in -order to be used. 
Those resources not listed in italics are capable of being -automatically downloaded at RDF generation time. - +Parameters: + [-d]: The directory into which to place the downloaded datasource files. + [-r]: The directory into which to place the RDF triples parsed from the + datasource files. + [-i]: The names of the datasources (comma-delimited) to download and process; + if not specified, all available datasources will be downloaded and + processed. These names are listed in the "RDF Generation Key" column in + the table above. + [-t]: A comma-separated list of NCBI taxonomy IDs. Only records for these IDs + will be included in the RDF triple output where applicable. If neither + -t nor -m is specified, all records will be included. + [-m]: Include only human and the 7 model organisms (fly, rat, mouse, yeast, + worm, arabidopsis, and zebrafish) in the generated RDF. If neither -t + nor -m is specified, all records will be included. + [-c]: Clean the data source files. If set, this flag will cause the data + source files to be re-downloaded prior to processing. 
``` -*1 ==> DIP* -*2 ==> HPRD_ID_MAPPINGS* -*3 ==> TRANSFAC_GENE* -*4 ==> TRANSFAC_MATRIX* -*5 ==> GAD* -6 ==> PHARMGKB_DISEASE -7 ==> PHARMGKB_GENE -*8 ==> PHARMGKB_RELATION* -9 ==> PHARMGKB_DRUG -10 ==> DRUGBANK -11 ==> HGNC -12 ==> HOMOLOGENE -13 ==> IREFWEB -14 ==> MGI_ENTREZGENE -15 ==> MGI_MGIPHENOGENO -16 ==> MGI_MRKLIST -17 ==> MGI_MRKREFERENCE -18 ==> MGI_MRKSEQUENCE -19 ==> MGI_MRKSWISSPROT -20 ==> MIRBASE -*21 ==> OMIM* -22 ==> RGD_GENES -23 ==> RGD_GENE_MP -24 ==> RGD_GENE_RDO -25 ==> RGD_GENE_NBO -26 ==> RGD_GENE_PW -27 ==> PREMOD_HUMAN -28 ==> PREMOD_MOUSE -29 ==> PR_MAPPINGFILE -30 ==> REACTOME_UNIPROT2PATHWAYSTID -31 ==> REFSEQ_RELEASECATALOG -32 ==> NCBIGENE_GENE2REFSEQ -33 ==> NCBIGENE_GENEINFO -34 ==> NCBIGENE_MIM2GENE -35 ==> NCBIGENE_REFSEQUNIPROTCOLLAB -36 ==> GOA -37 ==> UNIPROT_SWISSPROT -38 ==> UNIPROT_IDMAPPING -39 ==> UNIPROT_TREMBL_SPARSE -40 ==> INTERPRO_NAMESDAT -41 ==> INTERPRO_INTERPRO2GO -42 ==> INTERPRO_PROTEIN2IPR +Data source files that are publicly available will be automatically downloaded and saved under +the directory specified by the `-d` parameter. Data source files that require manual download +must be manually placed under the directory specified by the `-d` parameter prior to RDF generation. +Data source names that can be used as input to the `-i` parameter in the `download-datasources-and-generate-triples.sh` +script are listed in the above +table in the "RDF Generation Key" column. They can also be seen by running the following script: ``` +datasource-rdfizer/scripts/list-datasource-names.sh +``` + +## Example RDF Generation -While this is very convenient when dealing with some job schedulers, it also -allows for easy execution of single RDF generation jobs. 
For example, to -generate RDF for the MirBase database file (index = 20): +#### miRBase RDF Generation +For example, to generate RDF for the miRBase database file: ``` $ export DATA_DIR=[BASE_DIRECTORY_WHERE_DATA_FILES_TO_PARSE_LIVE] @@ -121,10 +163,10 @@ $ mkdir -p $DATA_DIR $ mkdir -p $RDF_DIR $ export DATE=[TODAYS_DATE_TO_TIMESTAMP_THE_DATA e.g. 2015-04-16] $ mvn clean install -$ ./datasource-rdfizer/scripts/download-ddatasources-and-generate-triples \ +$ ./datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh \ -d $DATA_DIR \ -r $RDF_DIR \ - -i 20 + -i MIRBASE ``` Note: you may need to adjust the Java Heap size in pom-rdf-gen.xml depending on @@ -137,28 +179,32 @@ group of species. Doing so can improve RDF generation time as well as limit the number of triples produced when parsing a file. Some of the file parsers are *species-aware* and the script allows one to specify the NCBI taxonomy ID of the species to which triple generation should be constrained. For example, -to limit RDF triples only to humans (NCBI taxonomy ID: 9606): +to constrain output to UniProt ID mapping records that pertain only to human +(NCBI taxonomy ID: 9606), run: ``` -./datasource-rdfizer/scripts/download-ddatasources-and-generate-triples \ +./datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh \ -d $DATA_DIR \ -r $RDF_DIR \ - -i 20 + -i UNIPROT_IDMAPPING \ -t 9606 ``` For human plus seven model organisms (fly, rat, mouse, yeast, worm, -arabidopsis, and zebrafish), use: +arabidopsis, and zebrafish), use the `-m` parameter: ``` -./datasource-rdfizer/scripts/download-ddatasources-and-generate-triples \ +./datasource-rdfizer/scripts/download-datasources-and-generate-triples.sh \ -d $DATA_DIR \ -r $RDF_DIR \ - -i 20 + -i UNIPROT_IDMAPPING \ -m ``` -When a taxon-aware file parser is used, some extra data is downloaded to ensure +_Note: when a taxon-aware file parser is used, some extra data is downloaded to ensure that the mappings from biological concepts to taxon
identifiers are present. This download can be time consuming due to one of the files being very -large, but it is a one-time cost. \ No newline at end of file +large, but it is a one-time cost._ + + + From eb7074cfee24772272432d9449fe3f1a3b33cd30 Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Thu, 11 Feb 2016 14:31:13 -0700 Subject: [PATCH 35/36] updated compiler version to 1.8 --- datasource-rdfizer/scripts/pom-rdf-gen-9606.xml | 4 ++-- datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml | 4 ++-- datasource-rdfizer/scripts/pom-rdf-gen.xml | 4 ++-- datasource-rdfizer/scripts/pom-rdf-list-datasource-names.xml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml b/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml index 915dc97..4be5962 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen-9606.xml @@ -54,8 +54,8 @@ maven-compiler-plugin 3.3 - 1.7 - 1.7 + 1.8 + 1.8 diff --git a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml index a805709..72b046a 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen-modelorgs.xml @@ -54,8 +54,8 @@ maven-compiler-plugin 3.3 - 1.7 - 1.7 + 1.8 + 1.8 diff --git a/datasource-rdfizer/scripts/pom-rdf-gen.xml b/datasource-rdfizer/scripts/pom-rdf-gen.xml index e0985d6..1712ba4 100644 --- a/datasource-rdfizer/scripts/pom-rdf-gen.xml +++ b/datasource-rdfizer/scripts/pom-rdf-gen.xml @@ -64,8 +64,8 @@ maven-compiler-plugin 3.3 - 1.7 - 1.7 + 1.8 + 1.8 diff --git a/datasource-rdfizer/scripts/pom-rdf-list-datasource-names.xml b/datasource-rdfizer/scripts/pom-rdf-list-datasource-names.xml index d4bc319..b485fe3 100644 --- a/datasource-rdfizer/scripts/pom-rdf-list-datasource-names.xml +++ b/datasource-rdfizer/scripts/pom-rdf-list-datasource-names.xml @@ -45,8 +45,8 @@ maven-compiler-plugin 3.3 - 1.7 - 1.7 + 1.8 + 
1.8 From bd9af86289ebce04dbb4b2ad7b942d82ab15c5df Mon Sep 17 00:00:00 2001 From: bill-baumgartner Date: Thu, 11 Feb 2016 14:33:47 -0700 Subject: [PATCH 36/36] updating poms for branch'release/0.6.1' with non-snapshot versions --- datasource-fileparsers/pom.xml | 2 +- datasource-identifiers/pom.xml | 2 +- datasource-rdfizer/pom.xml | 2 +- pom.xml | 7 ++----- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/datasource-fileparsers/pom.xml b/datasource-fileparsers/pom.xml index 8cffda9..90aceac 100644 --- a/datasource-fileparsers/pom.xml +++ b/datasource-fileparsers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.6.1-SNAPSHOT + 0.6.1 datasource-fileparsers diff --git a/datasource-identifiers/pom.xml b/datasource-identifiers/pom.xml index c12811c..52498a6 100644 --- a/datasource-identifiers/pom.xml +++ b/datasource-identifiers/pom.xml @@ -3,7 +3,7 @@ datasource edu.ucdenver.ccp - 0.6.1-SNAPSHOT + 0.6.1 datasource-identifiers diff --git a/datasource-rdfizer/pom.xml b/datasource-rdfizer/pom.xml index 728b501..5430d2b 100644 --- a/datasource-rdfizer/pom.xml +++ b/datasource-rdfizer/pom.xml @@ -3,7 +3,7 @@ edu.ucdenver.ccp datasource - 0.6.1-SNAPSHOT + 0.6.1 datasource-rdfizer diff --git a/pom.xml b/pom.xml index 028577b..3f6f3ec 100644 --- a/pom.xml +++ b/pom.xml @@ -1,12 +1,9 @@ - + 4.0.0 edu.ucdenver.ccp datasource - 0.6.1-SNAPSHOT + 0.6.1 pom