Skip to content

Commit

Permalink
Added handling for non-normalized (no URI) and erroneous identifiers
Browse files Browse the repository at this point in the history
Instead of being excluded from the output RDF they are now cataloged as
either NonNormalizedIdentifierRecords or ErroneousIdentifierRecords.
  • Loading branch information
bill-baumgartner committed Feb 8, 2016
1 parent eca2ca9 commit f85466a
Show file tree
Hide file tree
Showing 8 changed files with 771 additions and 270 deletions.
Original file line number Diff line number Diff line change
@@ -1,5 +1,39 @@
package edu.ucdenver.ccp.datasource.identifiers;

/*
* #%L
* Colorado Computational Pharmacology's datasource
* project
* %%
* Copyright (C) 2012 - 2016 Regents of the University of Colorado
* %%
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* #L%
*/

public class ProbableErrorDataSourceIdentifier extends DataSourceIdentifier<String> {

private final String dataSourceStr;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,39 @@
package edu.ucdenver.ccp.datasource.identifiers;

/*
* #%L
* Colorado Computational Pharmacology's datasource
* project
* %%
* Copyright (C) 2012 - 2016 Regents of the University of Colorado
* %%
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* #L%
*/

public class UnknownDataSourceIdentifier extends DataSourceIdentifier<String> {

private final String dataSourceStr;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
package edu.ucdenver.ccp.datasource.rdfizer.rdf.ice;

/*
* #%L
* Colorado Computational Pharmacology's datasource
* project
* %%
* Copyright (C) 2012 - 2016 Regents of the University of Colorado
* %%
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* #L%
*/

import edu.ucdenver.ccp.datasource.fileparsers.Record;
import edu.ucdenver.ccp.datasource.fileparsers.RecordField;
import edu.ucdenver.ccp.datasource.identifiers.DataSource;

/**
 * Immutable record that catalogs an identifier flagged as erroneous. It stores the identifier
 * string, the name of the data source it was attributed to, and a free-text comment.
 */
@Record(dataSource = DataSource.KABOB)
public class ErroneousIdentifierRecord {

    /** The identifier string being cataloged. */
    @RecordField
    private final String identifier;

    /** Name of the data source the identifier was attributed to. */
    @RecordField
    private final String datasource;

    /** Free-text comment stored alongside the identifier. */
    @RecordField
    private final String comment;

    /**
     * Creates a record from its three field values; the arguments are stored as given.
     *
     * @param identifier
     *            the identifier string
     * @param datasource
     *            the name of the data source the identifier was attributed to
     * @param comment
     *            the comment to store with the identifier
     */
    public ErroneousIdentifierRecord(String identifier, String datasource, String comment) {
        this.identifier = identifier;
        this.datasource = datasource;
        this.comment = comment;
    }

    /**
     * @return the identifier string supplied at construction
     */
    public String getIdentifier() {
        return this.identifier;
    }

    /**
     * @return the data source name supplied at construction
     */
    public String getDatasource() {
        return this.datasource;
    }

    /**
     * @return the comment supplied at construction
     */
    public String getComment() {
        return this.comment;
    }

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package edu.ucdenver.ccp.datasource.rdfizer.rdf.ice;

/*
* #%L
* Colorado Computational Pharmacology's datasource
* project
* %%
* Copyright (C) 2012 - 2016 Regents of the University of Colorado
* %%
* Redistribution and use in source and binary forms, with or without modification,
* are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
* list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
* this list of conditions and the following disclaimer in the documentation
* and/or other materials provided with the distribution.
*
* 3. Neither the name of the Regents of the University of Colorado nor the names of its contributors
* may be used to endorse or promote products derived from this software without
* specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
* INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
* DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
* OF THE POSSIBILITY OF SUCH DAMAGE.
* #L%
*/

import edu.ucdenver.ccp.datasource.fileparsers.Record;
import edu.ucdenver.ccp.datasource.fileparsers.RecordField;
import edu.ucdenver.ccp.datasource.identifiers.DataSource;

/**
 * Immutable record that catalogs a non-normalized identifier. It stores the identifier string and
 * the name of the data source it was attributed to.
 */
@Record(dataSource = DataSource.KABOB)
public class NonNormalizedIdentifierRecord {

    /** The identifier string being cataloged. */
    @RecordField
    private final String identifier;

    /** Name of the data source the identifier was attributed to. */
    @RecordField
    private final String datasource;

    /**
     * Creates a record from its two field values; the arguments are stored as given.
     *
     * @param identifier
     *            the identifier string
     * @param datasource
     *            the name of the data source the identifier was attributed to
     */
    public NonNormalizedIdentifierRecord(String identifier, String datasource) {
        this.identifier = identifier;
        this.datasource = datasource;
    }

    /**
     * @return the identifier string supplied at construction
     */
    public String getIdentifier() {
        return this.identifier;
    }

    /**
     * @return the data source name supplied at construction
     */
    public String getDatasource() {
        return this.datasource;
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,19 @@
import java.util.Map.Entry;
import java.util.Set;

import org.openrdf.model.Statement;
import org.openrdf.model.Value;
import org.openrdf.model.impl.URIImpl;
import org.openrdf.rio.ntriples.NTriplesUtil;

import edu.ucdenver.ccp.common.collections.CollectionsUtil;
import edu.ucdenver.ccp.common.digest.DigestUtil;
import edu.ucdenver.ccp.common.reflection.PrivateAccessor;
import edu.ucdenver.ccp.datasource.fileparsers.RecordField;
import edu.ucdenver.ccp.datasource.fileparsers.RecordUtil;
import edu.ucdenver.ccp.datasource.identifiers.DataSource;
import edu.ucdenver.ccp.datasource.identifiers.ProbableErrorDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.identifiers.UnknownDataSourceIdentifier;
import edu.ucdenver.ccp.datasource.rdfizer.rdf.vocabulary.KIAO;

/**
Expand Down Expand Up @@ -178,6 +182,30 @@ private static List<String> getSortedFieldValueUriStrs(Collection<Object> fieldV
* could be a collection, if so we return one string per value
*/
private static String getFieldValueUri(Object fieldValue) {
    /*
     * Unknown and probable-error data source identifiers get special handling: each is wrapped
     * in a cataloging record and represented by the space-delimited statements generated for
     * that record. The returned string is used to generate sha1 hashes, so it doesn't need to
     * be a true uri.
     */
    if (fieldValue instanceof UnknownDataSourceIdentifier) {
        UnknownDataSourceIdentifier unknownId = (UnknownDataSourceIdentifier) fieldValue;
        NonNormalizedIdentifierRecord nonNormalizedRecord = new NonNormalizedIdentifierRecord(
                unknownId.getDataElement(), unknownId.getDataSourceStr());
        URIImpl nonNormalizedRecordUri = RdfRecordUriFactory.createRecordUri(nonNormalizedRecord);
        List<Statement> nonNormalizedStmts = RdfRecordUtil.getRecordInstanceStatements(nonNormalizedRecord,
                System.currentTimeMillis(), nonNormalizedRecordUri, null, null, null);
        /* the first statement returned is a dataset has_part record triple which we do not need */
        nonNormalizedStmts.remove(0);
        return CollectionsUtil.createDelimitedString(nonNormalizedStmts, " ");
    }

    if (fieldValue instanceof ProbableErrorDataSourceIdentifier) {
        ProbableErrorDataSourceIdentifier errorId = (ProbableErrorDataSourceIdentifier) fieldValue;
        ErroneousIdentifierRecord erroneousRecord = new ErroneousIdentifierRecord(errorId.getDataElement(),
                errorId.getDataSourceStr(), errorId.getErrorMessage());
        URIImpl erroneousRecordUri = RdfRecordUriFactory.createRecordUri(erroneousRecord);
        List<Statement> erroneousStmts = RdfRecordUtil.getRecordInstanceStatements(erroneousRecord,
                System.currentTimeMillis(), erroneousRecordUri, null, null, null);
        /* the first statement returned is a dataset has_part record triple which we do not need */
        erroneousStmts.remove(0);
        return CollectionsUtil.createDelimitedString(erroneousStmts, " ");
    }

    /* every other field value is converted to an RDF value and serialized in N-Triples form */
    Value rdfValue = RdfUtil.getValue(fieldValue);
    return NTriplesUtil.toNTriplesString(rdfValue);
}
Expand Down Expand Up @@ -224,7 +252,6 @@ private static Collection<Object> getFieldValues(Object record, Field field) {
return null;
}

int fieldCount = 0;
Collection<Object> fieldValues = new ArrayList<Object>();

if (!(fieldValue instanceof Collection)) {
Expand Down
Loading

0 comments on commit f85466a

Please sign in to comment.