Skip to content
This repository has been archived by the owner on Dec 18, 2019. It is now read-only.

Commit

Permalink
Merge branch 'release-v33.2.3'
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark A. Matienzo committed Apr 13, 2015
2 parents 5dc444e + 09feb3f commit 4f41775
Show file tree
Hide file tree
Showing 15 changed files with 245 additions and 33 deletions.
4 changes: 4 additions & 0 deletions akara.conf.template
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ MODULES = [
"dplaingestion.akamod.move_date_values",
"dplaingestion.akamod.enrich_location",
"dplaingestion.akamod.lookup",
"dplaingestion.akamod.indiana_identify_object",
"dplaingestion.akamod.kentucky_identify_object",
"dplaingestion.akamod.georgia_identify_object",
"dplaingestion.akamod.bhl_contributor_to_collection",
Expand Down Expand Up @@ -231,6 +232,9 @@ class identify_object:
class contentdm_identify_object(identify_object):
pass

class indiana_identify_object(identify_object):
pass

class kentucky_identify_object(identify_object):
pass

Expand Down
9 changes: 8 additions & 1 deletion lib/akamod/enrich-type.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,8 @@ def enrichtype(body, ctype,
action="enrich-type",
prop="sourceResource/type",
format_field="sourceResource/format",
default=None):
default=None,
send_rejects_to_format=False):
"""
Service that accepts a JSON document and enriches the "type" field of that
document by:
Expand Down Expand Up @@ -74,5 +75,11 @@ def enrichtype(body, ctype,
del data['sourceResource']['type']
except:
pass
finally:
if send_rejects_to_format and type_strings:
rej = itemtype.rejects([(type_strings, type_for_type_keyword)])
if rej:
sr_format.extend(rej)
data['sourceResource']['format'] = sr_format

return json.dumps(data)
34 changes: 34 additions & 0 deletions lib/akamod/indiana_identify_object.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
"""
Pipeline module to assign edm:object for Indiana Memory data
"""

import json
from akara import response
from akara.services import simple_service


@simple_service('POST',
                'http://purl.org/la/dp/indiana_identify_object',
                'indiana_identify_object',
                'application/json')
def indiana_identify_object(body, ctype_ignored):
    """Assign edm:object based on dc:source.

    Per Indiana crosswalk, http://bit.ly/dpla-crosswalks
    dc:source lives in originalRecord.source

    body          -- request body, expected to be a JSON object
    ctype_ignored -- second service argument; not used by this function

    Returns the record serialized as JSON, with "object" set to the value of
    originalRecord.source.  On failure, calls prepare_error_response() and
    returns a plain-text error message instead.
    """
    # Keep each try body to the one operation that can fail, so a parse
    # error and a missing-field error cannot be confused with each other.
    try:
        record = json.loads(body)
    except ValueError:
        prepare_error_response()
        return 'Unable to parse request body as JSON'

    try:
        record['object'] = record['originalRecord']['source']
    except (KeyError, TypeError):
        # TypeError covers a body that is valid JSON but not an object, and
        # an originalRecord that is not a mapping (null, string, list).
        prepare_error_response()
        return 'No originalRecord.source for determining object'

    return json.dumps(record)

def prepare_error_response():
    """Mark the akara response as an HTTP 500 with a text/plain body"""
    status, content_type = 500, 'text/plain'
    response.code = status
    response.add_header('Content-Type', content_type)
2 changes: 1 addition & 1 deletion lib/akamod/move_date_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def cleanup(s):
return "Unable to parse body as JSON"

if exists(data, prop):
values = getprop(data, prop)
values = iterify(getprop(data, prop))
remove = []
toprop = iterify(getprop(data, to_prop)) if exists(data, to_prop) \
else []
Expand Down
10 changes: 10 additions & 0 deletions lib/itemtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,13 @@ def type_for_strings_and_mappings(string_map_combos):
return t
raise NoTypeError

def rejects(string_map_combos):
    """rejects([(list, list_of_tuples), ...])

    Takes the same (strings, mappings) pairs as
    type_for_strings_and_mappings.  Returns a list, in input order, of every
    string that is not equal to the value _type_for_keyword gives for it
    under the mappings it was paired with.
    """
    unmapped = []
    for candidates, type_mappings in string_map_combos:
        for candidate in candidates:
            if candidate == _type_for_keyword(candidate, type_mappings):
                # The string already is its own type; not a reject.
                continue
            unmapped.append(candidate)
    return unmapped
16 changes: 9 additions & 7 deletions lib/mappers/harvard_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,13 +133,15 @@ def map_extent(self):

def map_description(self):
prop = self.root_key + "note"

desc = getprop(self.provider_data, prop, True)
if isinstance(desc, dict):
desc = desc["#text"] if "#text" in desc else None

if desc:
self.update_source_resource({"description": desc})
out_desc = []
for desc in iterify(getprop(self.provider_data, prop, True)):
if isinstance(desc, dict):
desc = desc["#text"] if "#text" in desc else None
if desc:
out_desc.append(desc)

if out_desc:
self.update_source_resource({"description": out_desc})

def map_format(self):
return super(HarvardMapper, self).map_format(authority_condition=True)
Expand Down
2 changes: 2 additions & 0 deletions lib/mappers/marc_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -611,6 +611,8 @@ def display_date_for_none_given(self, begin, end):
def set_begin_end_dates(self, begin, end):
"""Given begin and end, set sourceResource/date properties"""
display_date = getprop(self.mapped_data, "sourceResource/date", True)
if isinstance(display_date, dict):
display_date = display_date.get("displayDate")
date = {
"displayDate": display_date or \
self.display_date_for_none_given(begin, end),
Expand Down
30 changes: 15 additions & 15 deletions lib/mappers/nypl_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,16 @@ def __init__(self, provider_data):
"technical director", "woodcutter"
]

def txt(self, n):
    """Return the text content of a provider-data node as a string.

    n -- the node: a string, a dict that may carry its text under the
         "#text" key, or anything else (including None)

    Returns n itself when it is a string, the "#text" value when n is a
    dict (or dict subclass) with a truthy "#text", and "" in every other
    case.
    """
    if not n:
        return ""
    # isinstance rather than an exact type comparison, so that dict
    # subclasses such as OrderedDict are not silently reduced to "".
    if isinstance(n, dict):
        return n.get("#text") or ""
    if isinstance(n, basestring):
        return n
    return ""

def map_title(self):
prop = "titleInfo"

Expand Down Expand Up @@ -48,24 +58,14 @@ def map_identifier(self):
if identifier:
self.update_source_resource({"identifier": identifier})

def map_description(self):
def txt(n):
if not n:
return ""
elif type(n) == dict:
return n.get("#text") or ""
elif isinstance(n, basestring):
return n
else:
return ""

note = txt(getprop(self.provider_data, "note", True))
def map_description(self):
note = self.txt(getprop(self.provider_data, "note", True))
pd = getprop(self.provider_data, "physicalDescription", True)
pnote = None
if type(pd) == list:
pnote = [e["note"] for e in pd if "note" in e] # Yes, a list.
pnote = [self.txt(e["note"]) for e in pd if "note" in e] # Yes, a list.
elif type(pd) == dict and "note" in pd:
pnote = txt(pd["note"]) # Yes, a string.
pnote = self.txt(pd["note"]) # Yes, a string.

desc = note or pnote
if desc:
Expand Down Expand Up @@ -227,7 +227,7 @@ def datestring(date_data):
# Map publisher
if ("publisher" in origin_info and origin_info["publisher"] not in
ret_dict["publisher"]):
ret_dict["publisher"].append(origin_info["publisher"])
ret_dict["publisher"].append(self.txt(origin_info["publisher"]))
# Map spatial
if exists(origin_info, "place/placeTerm"):
for place_term in iterify(getprop(origin_info,
Expand Down
26 changes: 18 additions & 8 deletions lib/oai.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,8 @@
import sys
import time, logging
import urllib

import urllib2
from amara.pushtree import pushtree
from amara.thirdparty import httplib2
from akara import logger
from dplaingestion.utilities import iterify
import xmltodict
Expand Down Expand Up @@ -106,13 +105,12 @@ class oaiservice(object):
- returns dictionary with keys "records", "resumption_token", and
"error", where records is a list of dictionaries
"""
def __init__(self, root, logger=logging, cachedir='/tmp/.cache'):
def __init__(self, root, logger=logging):
'''
root - root of the OAI service endpoint, e.g. http://dspace.mit.edu/oai/request
'''
self.root = root
self.logger = logger
self.h = httplib2.Http(cachedir)
return

def list_sets(self):
Expand All @@ -121,7 +119,14 @@ def list_sets(self):
url = self.root + '?' + qstr
self.logger.debug('OAI request URL: {0}'.format(url))
start_t = time.time()
resp, content = self.h.request(url)
try:
    content = urllib2.urlopen(url).read()
except urllib2.HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be caught first;
    # listed second, it is unreachable and the status code is never reported.
    raise OAIHTTPError("list_sets got status %d: %s" % \
                       (e.code, e.reason))
except urllib2.URLError as e:
    # No HTTP response was obtained at all.
    raise OAIHTTPError("list_sets could not make request: %s" % \
                       e.reason)
retrieved_t = time.time()
self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))
sets = []
Expand Down Expand Up @@ -173,10 +178,15 @@ def list_records(self, set_id=None, resumption_token="", metadataPrefix="",
url = self.root + '?' + qstr
self.logger.debug('OAI request URL: {0}'.format(url))
start_t = time.time()
resp, content = self.h.request(url)
try:
    content = urllib2.urlopen(url).read()
except urllib2.HTTPError as e:
    # HTTPError is a subclass of URLError, so it has to be caught first;
    # listed second, it is unreachable and the status code is never reported.
    raise OAIHTTPError("list_records got status %d: %s" % \
                       (e.code, e.reason))
except urllib2.URLError as e:
    # No HTTP response was obtained at all.
    raise OAIHTTPError("list_records could not make request: %s" % \
                       e.reason)
retrieved_t = time.time()
if resp.status != 200:
raise OAIHTTPError("Status code: %d" % resp.status)
self.logger.debug('Retrieved in {0}s'.format(retrieved_t - start_t))

xml_content = XML_PARSE(content)
Expand Down
46 changes: 46 additions & 0 deletions profiles/indiana.pjs
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
{
"name": "indiana",
"type": "oai_verbs",
"endpoint_url": "https://digital.library.in.gov/OAI/Server",
"contributor": {
"@id": "http://dp.la/api/contributor/indiana",
"name": "Indiana Memory"
},
"enrichments_coll": [
"/set_context",
"/validate_mapv3"
],
"enrichments_item": [
"/select-id",
"/dpla_mapper?mapper_type=dublin_core",
"/strip_html",
"/indiana_identify_object",
"/set_context",
"/move_date_values?prop=sourceResource%2Fspatial&to_prop=sourceResource%2Fdate",
"/shred?prop=sourceResource%2Fsubject%2CsourceResource%2Fspatial%2CsourceResource%2Ftype",
"/cleanup_value",
"/capitalize_value",
"/enrich_earliest_date",
"/enrich-subject",
"/enrich_date",
"/set_spec_type",
"/enrich-type?send_rejects_to_format=true",
"/enrich-format",
"/enrich-type?default=image",
"/enrich_location",
"/geocode",
"/enrich_language",
"/set_prop?prop=sourceResource%2FstateLocatedIn&value=Indiana",
"/enrich_location?prop=sourceResource%2FstateLocatedIn",
"/unset_prop?prop=dataProvider",
"/copy_prop?prop=sourceResource%2Fcontributor&to_prop=dataProvider",
"/unset_prop?prop=sourceResource%2Fcontributor",
"/copy_prop?prop=provider%2Fname&to_prop=dataProvider&skip_if_exists=True",
"/validate_mapv3"
],
"thresholds": {
"added": 5000,
"changed": 1000,
"deleted": 1000
}
}
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from distutils.core import setup

setup( name = 'ingestion',
version = '33.2.2',
version = '33.2.3',
description='DPLA Ingestion System',
author='Digital Public Library of America',
author_email='[email protected]',
Expand Down
4 changes: 4 additions & 0 deletions test/server_support.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,7 @@ class Akara:
"dplaingestion.akamod.move_date_values",
"dplaingestion.akamod.enrich_location",
"dplaingestion.akamod.lookup",
"dplaingestion.akamod.indiana_identify_object",
"dplaingestion.akamod.kentucky_identify_object",
"dplaingestion.akamod.georgia_identify_object",
"dplaingestion.akamod.bhl_contributor_to_collection",
Expand Down Expand Up @@ -215,6 +216,9 @@ class identify_object:
class contentdm_identify_object(identify_object):
pass
class indiana_identify_object(identify_object):
pass
class kentucky_identify_object(identify_object):
pass
Expand Down
48 changes: 48 additions & 0 deletions test/test_enrich_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,5 +78,53 @@ def test_type_for_type_keyword():
assert resp.status == 200
assert_same_jsons(EXPECTED, json.loads(content))

def test_type_set_format():
    """Rejected type strings land in format when requested

    Posts three records to enrich-type with send_rejects_to_format=true and
    checks the resulting sourceResource for each.
    """
    url = server() + "enrich-type?send_rejects_to_format=true"
    # (incoming sourceResource/type, expected sourceResource after enrichment)
    cases = [
        # Yields a type but is not itself one: the string is kept in format.
        ("digital photograph",
         {"type": "image", "format": ["digital photograph"]}),
        # Already a type: no format expected.
        ("text",
         {"type": "text"}),
        # Yields no type: only format is expected.
        ("weird thing",
         {"format": ["weird thing"]}),
    ]
    for type_value, expected_sr in cases:
        payload = json.dumps({"sourceResource": {"type": type_value}})
        resp, content = H.request(url, "POST", body=payload)
        assert resp.status == 200
        assert_same_jsons({"sourceResource": expected_sr},
                          json.loads(content))

if __name__ == "__main__":
raise SystemExit("Use nosetest")
19 changes: 19 additions & 0 deletions test/test_indiana_identify_object.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import json
from server_support import server, H
from dict_differ import assert_same_jsons


def test_indiana_identify_object_from_source():
    """Indiana object is assigned from source"""
    # The service is expected to echo the record back with "object" added,
    # carrying the same value as originalRecord.source.
    payload = {'originalRecord': {'source': 'http://thumbnail/url'}}
    wanted = {
        'originalRecord': {'source': 'http://thumbnail/url'},
        'object': 'http://thumbnail/url'
    }
    endpoint = server() + 'indiana_identify_object'
    encoded_payload = json.dumps(payload)
    meta, raw_body = H.request(endpoint, 'POST', body=encoded_payload)
    assert meta.status == 200
    assert_same_jsons(wanted, raw_body)
Loading

0 comments on commit 4f41775

Please sign in to comment.