From 0767dd7264682f6fb459a595378b8be044396fd9 Mon Sep 17 00:00:00 2001 From: jrhoads Date: Wed, 27 Apr 2022 14:18:41 -0400 Subject: [PATCH 1/2] Raise exceptions if the identifiers are missing key information --- tests/unit/test_catalogs.py | 25 ++++++++++++++++++++++--- viringo/catalogs.py | 15 +++++++++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/tests/unit/test_catalogs.py b/tests/unit/test_catalogs.py index 1cc8e2a..fdf3d6f 100644 --- a/tests/unit/test_catalogs.py +++ b/tests/unit/test_catalogs.py @@ -1,5 +1,6 @@ """Unit tests for the OAI-PMH DataCite Catalog implementation""" +import pytest from viringo import catalogs def test_set_to_search_query(): @@ -62,9 +63,7 @@ def test_identifier_to_string(): 'type': 'DOI', 'identifier': "10.5072/1234" } - identifier_string = catalogs.identifier_to_string(identifier_type) - assert identifier_string == "doi:10.5072/1234" def test_numeric_identifier_to_string(): @@ -74,7 +73,27 @@ def test_numeric_identifier_to_string(): 'type': 'ISBN', 'identifier': 9783851254679 } - identifier_string = catalogs.identifier_to_string(identifier_type) + assert identifier_string == "isbn:9783851254679" +def test_none_identifier_to_string(): + """Tests for converting an ISBN numeric identifier to single string""" + + identifier_type = { + 'type': "ABC", + 'identifier': None + } + identifier_string = catalogs.identifier_to_string(identifier_type) assert identifier_string == "isbn:9783851254679" + +def test_none_type_identifier_to_string(): + """Tests for converting an ISBN numeric identifier to single string""" + + identifier_type = { + 'type': None, + 'identifier': 9783851254679 + } + with pytest.raises(catalogs.InvalidIdentifierException) as exc: + identifier_string = catalogs.identifier_to_string(identifier_type) + assert exc.type == catalogs.InvalidIdentifierException + diff --git a/viringo/catalogs.py b/viringo/catalogs.py index 514f032..5afb68f 100644 --- a/viringo/catalogs.py +++ b/viringo/catalogs.py @@ -14,6 +14,10 @@ from .services import datacite +class InvalidIdentifierException(Exception): + pass + + class DataCiteOAIServer(): """Build OAI-PMH data responses for DataCite metadata catalog""" @@ -253,12 +257,15 @@ def build_metadata_map(self, result): rights.append(right['uri']) identifiers = [ - identifier_to_string(identifier) for identifier in result.identifiers + identifier_to_string(identifier) + for identifier in result.identifiers + if 'type' in identifier ] relations = [ identifier_to_string(relation) for relation in result.relations + if 'type' in relation ] contributors = [ @@ -328,5 +335,9 @@ def set_to_provider_client(unparsed_set): def identifier_to_string(identifier): """Take an identifier and return in a formatted in single string""" _id = identifier.get('identifier') - _type = identifier.get('type', '') + _type = identifier.get('type','') + if not _type: + raise InvalidIdentifierException("Missing 'type' information") + if not _id: + raise InvalidIdentifierException("Missing 'id' information") return _type.lower() + ":" + str(_id) From 675d7da152b6a035f5c3f3fe5289362a069c9fbc Mon Sep 17 00:00:00 2001 From: jrhoads Date: Wed, 27 Apr 2022 16:36:44 -0400 Subject: [PATCH 2/2] Refactor identifier preparation and skip any poorly defined identifiers --- .github/workflows/pull_reqest.yml | 2 +- tests/unit/test_catalogs.py | 5 ++- tests/unit/test_service_datacite.py | 38 ++++++++++++++++ viringo/services/datacite.py | 68 +++++++++++++++-------------- 4 files changed, 77 insertions(+), 36 deletions(-) diff --git a/.github/workflows/pull_reqest.yml b/.github/workflows/pull_reqest.yml index c47817d..df72641 100644 --- a/.github/workflows/pull_reqest.yml +++ b/.github/workflows/pull_reqest.yml @@ -12,7 +12,7 @@ jobs: - name: Build the stack run: docker compose -f "docker-compose.yml" up -d - name: Run Tests - run: docker compose -f "docker-compose.yml" run web ./bin/ci.sh + run: docker compose -f "docker-compose.yml" exec web ./bin/ci.sh - name: Stop containers if: always() run: docker compose -f "docker-compose.yml" down diff --git a/tests/unit/test_catalogs.py b/tests/unit/test_catalogs.py index fdf3d6f..3765ce9 100644 --- a/tests/unit/test_catalogs.py +++ b/tests/unit/test_catalogs.py @@ -83,8 +83,9 @@ def test_none_identifier_to_string(): 'type': "ABC", 'identifier': None } - identifier_string = catalogs.identifier_to_string(identifier_type) - assert identifier_string == "isbn:9783851254679" + with pytest.raises(catalogs.InvalidIdentifierException) as exc: + identifier_string = catalogs.identifier_to_string(identifier_type) + assert exc.type == catalogs.InvalidIdentifierException def test_none_type_identifier_to_string(): """Tests for converting an ISBN numeric identifier to single string""" diff --git a/tests/unit/test_service_datacite.py b/tests/unit/test_service_datacite.py index 74b4ed0..49700bf 100644 --- a/tests/unit/test_service_datacite.py +++ b/tests/unit/test_service_datacite.py @@ -13,3 +13,41 @@ def test_strip_uri_prefix(): identifier = datacite.strip_uri_prefix("") assert identifier == "" + +def test_identifiers(): + SAMPLE_ATTRIBUTES = { + 'identifiers': [ + { + 'identifierType': 'ISNI', + 'identifier': 9783851254679 + }, + ] + } + + assert datacite.identifiers_from_attributes( SAMPLE_ATTRIBUTES ) == [ + { + 'type': 'ISNI', + 'identifier': '9783851254679' + } + ] + +def test_skip_invlaid_identifiers(): + INVALID_ATTRIBUTES = { + 'identifiers': [ + { + 'identifierType': 'ISBN', + 'identifier': "XXXXX" + }, + { + 'identifierType': None, + 'identifier': 9783851254679 + }, + ] + } + + assert datacite.identifiers_from_attributes( INVALID_ATTRIBUTES ) == [ + { + 'type': 'ISBN', + 'identifier': 'XXXXX' + } + ] diff --git a/viringo/services/datacite.py b/viringo/services/datacite.py index 98d4b4e..aa88678 100644 --- a/viringo/services/datacite.py +++ b/viringo/services/datacite.py @@ -114,12 +114,10 @@ def build_metadata(data): type_ = date.get('dateType') value = date.get('date') if bool(type_) and bool(value) : - result.dates.append( - { - 'type': type_, - 'date': value - } - ) + result.dates.append({ + 'type': type_, + 'date': value + }) result.contributors = attributes.get('contributors',[]) result.funding_references = attributes.get('fundingReferences',[]) @@ -133,37 +131,14 @@ def build_metadata(data): if attributes['types'].get('resourceType') is not None else [] result.formats = attributes.get('formats', []) - - result.identifiers = [] - - # handle missing identifiers attribute - for identifier in attributes.get('identifiers',[]): - if identifier['identifier']: - # Special handling for the fact the API could return bad identifier - # as a list rather than a string - if isinstance(identifier['identifier'], list): - identifier['identifier'] = ','.join( - identifier['identifier']) - - result.identifiers.append({ - 'type': identifier['identifierType'], - 'identifier': strip_uri_prefix(identifier['identifier']) - }) - + result.identifiers = identifiers_from_attributes(attributes) result.language = attributes.get('language', '') - - result.relations = [] - for related in attributes.get('relatedIdentifiers',[]): - if 'relatedIdentifier' in related: - result.relations.append({ - 'type': related['relatedIdentifierType'], - 'identifier': related['relatedIdentifier'] - }) + result.relations = relations_from_attributes(attributes) result.rights = [ { - 'statement': right.get('rights', None), - 'uri': right.get('rightsUri', None) + 'statement': right.get('rights'), + 'uri': right.get('rightsUri') } for right in attributes.get('rightsList', []) ] @@ -181,6 +156,33 @@ def build_metadata(data): return result +def relations_from_attributes(attributes): + return identifiers_from_attributes(attributes, + list_key='relatedIdentifiers', + type_key='relatedIdentifierType', + value_key='relatedIdentifier') + +def identifiers_from_attributes(attributes, + list_key='identifiers', + type_key='identifierType', + value_key='identifier'): + identifiers = [] + for identifier in attributes.get(list_key,[]): + type_ = identifier.get(type_key) + value = identifier.get(value_key) + if bool(type_) and bool(value) : + if isinstance(value, list): + value = ','.join( value ) + if value_key == 'identifier': + value = strip_uri_prefix(value) + identifiers.append({ + 'type': type_, + 'identifier': value + }) + return identifiers + + + def strip_uri_prefix(identifier): """Strip common prefixes because OAI doesn't work with those kind of ID's"""