Skip to content
This repository has been archived by the owner on Dec 18, 2019. It is now read-only.

Commit

Permalink
Merge branch 'release-v29.1'
Browse files Browse the repository at this point in the history
  • Loading branch information
Mark Breedlove committed Jul 14, 2014
2 parents b3b88d9 + 17616bd commit 1ac9da7
Show file tree
Hide file tree
Showing 6 changed files with 191 additions and 45 deletions.
46 changes: 18 additions & 28 deletions lib/akamod/enrich_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from dateutil.parser import parse as dateutil_parse
from zen import dateparser
from dplaingestion.selector import getprop, setprop, delprop, exists
from dplaingestion.utilities import iterify
from dplaingestion.utilities import iterify, clean_date, \
remove_brackets_and_strip

HTTP_INTERNAL_SERVER_ERROR = 500
HTTP_TYPE_JSON = 'application/json'
Expand Down Expand Up @@ -232,10 +233,6 @@ def parse_date_or_range(d):

return a, b

def remove_brackets_and_strip(d):
    """Return the date (range) string without square brackets.

    Every "[" and "]" is deleted wherever it occurs, then leading and
    trailing whitespace is stripped from the result.
    """
    for bracket in ("[", "]"):
        d = d.replace(bracket, "")
    return d.strip()

def test_parse_date_or_range():
DATE_TESTS = {
"ca. July 1896": ("1896-07", "1896-07"), # fuzzy dates
Expand All @@ -254,14 +251,6 @@ def test_parse_date_or_range():
res = parse_date_or_range(i)
assert res == DATE_TESTS[i], "For input '%s', expected '%s' but got '%s'"%(i,DATE_TESTS[i],res)

def clean_date(d):
    """Return a date string with range separators normalized and noise removed.

    The substitutions are applied in order:

    1. "to" (with any surrounding whitespace) and a whitespace-delimited
       "-" or "/" are replaced by a bare "-".
    2. "?", "(", ")", "ca"/"ca." (with surrounding whitespace), "~" and "x"
       are deleted.
    3. Unless the string contains "circa" or "century", "c"/"c." (with
       surrounding whitespace) is deleted as well.

    The result is stripped of leading and trailing whitespace.
    """
    # Raw strings keep the regex escapes (\s, \?, \., ...) from being read as
    # (invalid) string escape sequences.
    substitutions = [
        (r"\s*to\s*|\s[-/]\s", "-"),
        (r"[\?\(\)]|\s*ca\.?\s*|~|x", ""),
    ]
    if "circa" not in d and "century" not in d:
        # NOTE(review): this pattern is unanchored, so it removes every "c"
        # in the string (e.g. inside "Dec"), not only a leading "c." --
        # confirm that callers rely on this before tightening it.
        substitutions.append((r"\s*c\.?\s*", ""))
    for pattern, replacement in substitutions:
        d = re.sub(pattern, replacement, d)
    return d.strip()

def convert_dates(data, prop, earliest):
"""Converts dates.
Expand All @@ -274,11 +263,10 @@ def convert_dates(data, prop, earliest):
Returns:
Nothing, the replacement is done in place.
"""
dates = []
for p in prop.split(','):
dates = []
if exists(data, p):
v = getprop(data, p)

if not isinstance(v, dict):
for s in (v if not isinstance(v, basestring) else [v]):
for part in s.split(";"):
Expand All @@ -287,24 +275,26 @@ def convert_dates(data, prop, earliest):
if len(stripped) < 4:
continue
a, b = parse_date_or_range(stripped)
if b != '3000-01-01':
if b != DEFAULT_DATETIME_STR:
dates.append( {
"begin": a,
"end": b,
"displayDate" : display_date
})

dates.sort(key=lambda d: d["begin"] if d["begin"] is not None else DEFAULT_DATETIME_STR)

value_to_set = dates
if earliest and dates:
value_to_set = dates[0]

if value_to_set:
setprop(data, p, value_to_set)
else:
if exists(data, p):
delprop(data, p)
else:
# Already filled in, probably by mapper
continue

dates.sort(key=lambda d: d["begin"] if d["begin"] is not None
else DEFAULT_DATETIME_STR)
if dates:
if earliest:
value_to_set = dates[0]
else:
value_to_set = dates
setprop(data, p, value_to_set)
else:
delprop(data, p)

def check_date_format(data, prop):
"""Checks that the begin and end dates are in the proper format"""
Expand Down
131 changes: 127 additions & 4 deletions lib/mappers/marc_mapper.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import re
import datetime
from dplaingestion.utilities import iterify
from dplaingestion.selector import exists, setprop, delprop
from dplaingestion.selector import getprop
from collections import OrderedDict
from dplaingestion.mappers.mapper import Mapper
from dplaingestion.utilities import strip_unclosed_brackets

class MARCMapper(Mapper):

Expand Down Expand Up @@ -41,7 +43,7 @@ def __init__(self, provider_data, key_prefix=None):
self.mapping_dict = {
lambda t: t == "856": [(self.map_is_shown_at, "u")],
lambda t: t == "041": [(self.map_language, "a")],
lambda t: t == "260": [(self.map_date, "c"),
lambda t: t == "260": [(self.map_display_date, "c"),
(self.map_publisher, "ab")],

lambda t: t == "300": [(self.map_extent, "ac")],
Expand Down Expand Up @@ -224,6 +226,19 @@ def _get_values(self, _dict, codes):

return values

def _get_one_subfield(self, _dict, code):
    """Return the "#text" of the first subfield whose "code" equals `code`.

    _dict: a dictionary of one element of the "datafield" list
    code:  one MARC subfield character code

    Returns None when no subfield matches, or when a KeyError or
    IndexError is raised while the subfields are being examined.
    """
    matches = []
    try:
        # Every subfield is examined (not just up to the first match), so a
        # malformed subfield anywhere in the list yields None.
        for subfield in self._get_subfields(_dict):
            if subfield["code"] == code:
                matches.append(subfield["#text"])
    except (KeyError, IndexError):
        return None
    if not matches:
        return None
    # Only one subfield with this code is expected; take the first.
    return matches[0]

def _get_subject_values(self, _dict, tag):
"""
Extracts the "#text" values from _dict for the subject field and
Expand Down Expand Up @@ -296,9 +311,16 @@ def map_language(self, _dict, tag, codes):
prop = "sourceResource/language"
self.extend_prop(prop, _dict, codes)

def map_date(self, _dict, tag, codes):
prop = "sourceResource/date"
self.extend_prop(prop, _dict, codes)
def map_display_date(self, _dict, tag, code):
    """Store the cleaned-up date subfield as sourceResource/date.

    The value stored here is what will become the displayDate.  It will be
    further processed down the pipeline, or recreated as a dictionary by
    the Control Field 008 mapping.

    _dict: a dictionary of one element of the "datafield" list
    tag:   the datafield tag (not used here)
    code:  the MARC subfield code holding the date
    """
    raw_date = self._get_one_subfield(_dict, code)
    if not raw_date:
        # No such subfield (or an empty one): fall back to an empty string.
        raw_date = ""
    # Trim surrounding ";", "." and spaces, then drop any "[" that is never
    # closed.
    trimmed = raw_date.strip(";. ")
    self.mapped_data["sourceResource"]["date"] = \
        strip_unclosed_brackets(trimmed)

def map_publisher(self, _dict, tag, codes):
prop = "sourceResource/publisher"
Expand Down Expand Up @@ -507,7 +529,103 @@ def map_datafield_tags(self):
func, index, codes = func_tuple
func(_dict, tag, index, codes)

### MARC control field 008 date-parsing functions
# http://www.loc.gov/marc/archive/2000/concise/ecbd008s.html

def cf8_multiple_dates(self, s):
    """Begin and end dates for MARC control field 008, Type of Date "m".

    s: the text of control field 008

    Returns a (begin, end) tuple: characters 7-10 and 11-14 of s.
    """
    return (s[7:11], s[11:15])

def cf8_detailed_date(self, s):
    """Begin and end dates for MARC control field 008, Type of Date "e".

    s: the text of control field 008

    The field holds a single detailed date, so the same "year-month-day"
    string is returned for both begin and end.
    """
    # Characters 7-10 are the year, 11-12 the month, 13-14 the day.
    pieces = (s[7:11], s[11:13], s[13:15])
    detailed = "-".join(pieces)
    return (detailed, detailed)

def cf8_single_date(self, s):
    """Begin and end dates for MARC control field 008, Type of Date "s".

    s: the text of control field 008

    The field holds one date, so characters 7-10 of s are returned as both
    begin and end.
    """
    single = s[7:11]
    return single, single

def cf8_reissue_date(self, s):
    """Begin and end dates for MARC control field 008, Type of Date "r".

    s: the text of control field 008

    A reissue entry contains the date reissued and the original date, if
    known.  Only one date is represented, so the reissue date (characters
    7-10 of s) is returned as both begin and end.
    """
    reissued = s[7:11]
    return reissued, reissued

def cf8_pub_copy_date(self, s):
    """Begin and end dates for MARC control field 008, Type of Date "t".

    s: the text of control field 008

    The entry holds a publication date and a copyright date.  Only the
    publication date (characters 7-10 of s) is represented; it is returned
    as both begin and end.
    """
    published = s[7:11]
    return published, published

def cf8_serial_item_current(self, s):
    """Begin and end dates, MARC control field 008, type "c".

    Serial item in current publication.

    s: the text of control field 008

    Returns (begin, end), where begin is characters 7-10 of s and end is
    the current year as a string.
    """
    # The MARC spec says the end year should be "9999", but other values
    # have been seen in practice.  "9999" is a bogus value anyway and the
    # current year looks better, so the end characters of s are ignored.
    this_year = datetime.datetime.today().year
    return (s[7:11], str(this_year))

def cf8_serial_item_ceased_pub(self, s):
    """Begin and end dates, MARC control field 008, type "d".

    Serial item that has ceased publication.

    s: the text of control field 008

    Returns a (begin, end) tuple: characters 7-10 and 11-14 of s.
    """
    first_year, last_year = s[7:11], s[11:15]
    return (first_year, last_year)

def display_date_for_none_given(self, begin, end):
    """Construct a display date for use when no display date was mapped.

    Returns begin unchanged when begin and end are equal, otherwise the
    string "<begin>-<end>".
    """
    if begin == end:
        return begin
    return "%s-%s" % (begin, end)

def set_begin_end_dates(self, begin, end):
    """Replace sourceResource/date with a displayDate/begin/end dictionary.

    The value currently stored at sourceResource/date is kept as the
    displayDate; when that value is missing or empty, a display date is
    built from begin and end instead.
    """
    path = "sourceResource/date"
    # NOTE(review): the third getprop argument presumably makes a missing
    # property come back as None instead of raising -- confirm in selector.
    shown = getprop(self.mapped_data, path, True)
    if not shown:
        shown = self.display_date_for_none_given(begin, end)
    setprop(self.mapped_data, path,
            {"displayDate": shown, "begin": begin, "end": end})

def map_controlfield_tags(self):
date_func = {
"m": "cf8_multiple_dates",
"q": "cf8_multiple_dates",
"s": "cf8_single_date",
"e": "cf8_detailed_date",
"r": "cf8_reissue_date",
"t": "cf8_pub_copy_date",
"d": "cf8_serial_item_ceased_pub",
"c": "cf8_serial_item_current"
}
for item in iterify(getprop(self.provider_data, "controlfield")):
if "#text" in item and "tag" in item:
if item["tag"] == "001":
Expand All @@ -521,6 +639,11 @@ def map_controlfield_tags(self):
pass
elif item["tag"] == "008":
text = item["#text"]
type_of_date = text[6]
if type_of_date in date_func:
f = getattr(self, date_func[type_of_date])
(begin, end) = f(text)
self.set_begin_end_dates(begin, end)
if len(text) > 18:
self.control_008_18 = text[18]
if len(text) > 21:
Expand Down
17 changes: 17 additions & 0 deletions lib/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import sys
import time
import tarfile
import re
from functools import wraps

def iterify(iterable):
Expand Down Expand Up @@ -94,3 +95,19 @@ def pause(attempt):
return func_with_retries

return apply_with_retries

def clean_date(d):
    """Return a given date string without certain characters and expressions.

    The substitutions are applied in order:

    1. "to" (with any surrounding whitespace) and a whitespace-delimited
       "-" or "/" are replaced by a bare "-".
    2. "?", "(", ")", "ca"/"ca." (with surrounding whitespace), "~" and "x"
       are deleted.
    3. Unless the string contains "circa" or "century", "c"/"c." (with
       surrounding whitespace) is deleted as well.

    The result is stripped of leading and trailing whitespace.
    """
    # Raw strings keep the regex escapes (\s, \?, \., ...) from being read as
    # (invalid) string escape sequences.
    regex = [
        (r"\s*to\s*|\s[-/]\s", "-"),
        (r"[\?\(\)]|\s*ca\.?\s*|~|x", ""),
    ]
    if "circa" not in d and "century" not in d:
        # NOTE(review): this pattern is unanchored, so it removes every "c"
        # in the string (e.g. inside "Dec"), not only a leading "c." --
        # confirm that callers rely on this before tightening it.
        regex.append((r"\s*c\.?\s*", ""))
    for pattern, replacement in regex:
        d = re.sub(pattern, replacement, d)
    return d.strip()

def remove_brackets_and_strip(d):
    """Return a given date-range string without square brackets.

    Every "[" and "]" is deleted, then leading and trailing periods and
    spaces are stripped from the result.
    """
    unbracketed = re.sub(r"[\[\]]", "", d)
    return unbracketed.strip(". ")

def strip_unclosed_brackets(s):
    """Return s with every unclosed "[" removed.

    A "[" counts as unclosed when no "]" occurs anywhere after it in the
    string.  Closing brackets and properly closed pairs are left alone.
    """
    # "[" followed (negative lookahead) by no run of non-"]" characters
    # that ends in "]".
    unclosed_open_bracket = re.compile(r'\[(?![^\]]*?\])')
    return unclosed_open_bracket.sub('', s)
3 changes: 3 additions & 0 deletions profiles/artstor.pjs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
"type": "oai_verbs",
"endpoint_url": "http://oai.artstor.org/oaicatmuseum/OAIHandler",
"sets": [
"SSDPLABrynMawr",
"SSDPLACornell",
"SSDPLAUCSD",
"SSDPLAWashington",
"SSDelwareAtlas",
"SSDelwareGeorge",
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from distutils.core import setup

setup( name = 'ingestion',
version = '29.0',
version = '29.1',
description='DPLA Ingestion Subsystem',
author='Digital Public Library of America',
author_email='[email protected]',
Expand Down
37 changes: 25 additions & 12 deletions test/test_enrich_date.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,18 +391,31 @@ def test_enrich_date_parse_format_date_range4():

def test_enrich_date_parse_century_date():
"""Correctly transform a date of format '19th c.'"""
INPUT = ["19th c.", "19th century"]

url = server() + "enrich_earliest_date?prop=date"

for i in INPUT:
input = {"date": i}
resp,content = H.request(url,"POST",body=json.dumps(input))
assert str(resp.status).startswith("2")

result = json.loads(content)
expected = {"date": {"begin": None, "end": None, "displayDate": i}}
assert result['date'] == expected[u'date'], "%s != %s" % (result['date'], expected[u'date'])
INPUT = {"date": "19th c."}
EXPECTED = {
"date": {
"begin": None,
"end": None,
"displayDate": "19th c" # period stripped assumed OK
}
}
resp,content = H.request(url,"POST",body=json.dumps(INPUT))
result = json.loads(content)
assert result["date"] == EXPECTED["date"], \
"%s != %s" % (result["date"], EXPECTED["date"])
INPUT = {"date": "19th century"}
EXPECTED = {
"date": {
"begin": None,
"end": None,
"displayDate": "19th century"
}
}
resp,content = H.request(url,"POST",body=json.dumps(INPUT))
result = json.loads(content)
assert result["date"] == EXPECTED["date"], \
"%s != %s" % (result["date"], EXPECTED["date"])


def test_enrich_date_parse_century_date_with_P():
Expand All @@ -414,7 +427,7 @@ def test_enrich_date_parse_century_date_with_P():
u'date' : {
u'begin' : None,
u'end' : None,
"displayDate" : "19th c."
u"displayDate" : u"19th c"
}
}

Expand Down

0 comments on commit 1ac9da7

Please sign in to comment.