Merge branch 'release-v29.1'

dpla-attic · Jul 14, 2014 · 1ac9da7 · 1ac9da7
2 parents b3b88d9 + 17616bd
commit 1ac9da7
Show file tree

Hide file tree

Showing 6 changed files with 191 additions and 45 deletions.
diff --git a/lib/akamod/enrich_date.py b/lib/akamod/enrich_date.py
@@ -8,7 +8,8 @@
 from dateutil.parser import parse as dateutil_parse
 from zen import dateparser
 from dplaingestion.selector import getprop, setprop, delprop, exists
-from dplaingestion.utilities import iterify
+from dplaingestion.utilities import iterify, clean_date, \
+                                    remove_brackets_and_strip
 
 HTTP_INTERNAL_SERVER_ERROR = 500
 HTTP_TYPE_JSON = 'application/json'
@@ -232,10 +233,6 @@ def parse_date_or_range(d):
 
     return a, b
 
-def remove_brackets_and_strip(d):
-    """Removed brackets from the date (range)."""
-    return d.replace("[", "").replace("]", "").strip()
-
 def test_parse_date_or_range():
     DATE_TESTS = {
         "ca. July 1896": ("1896-07", "1896-07"), # fuzzy dates
@@ -254,14 +251,6 @@ def test_parse_date_or_range():
         res = parse_date_or_range(i)
         assert res == DATE_TESTS[i], "For input '%s', expected '%s' but got '%s'"%(i,DATE_TESTS[i],res)
 
-def clean_date(d):
-    regex = [("\s*to\s*|\s[-/]\s", "-"), ("[\?\(\)]|\s*ca\.?\s*|~|x", "")]
-    if not "circa" in d and not "century" in d:
-        regex.append(("\s*c\.?\s*", ""))
-    for p, r in regex:
-        d = re.sub(p, r, d)
-    return d.strip()
-
 def convert_dates(data, prop, earliest):
     """Converts dates.
 
@@ -274,11 +263,10 @@ def convert_dates(data, prop, earliest):
     Returns:
     Nothing, the replacement is done in place.
     """
-    dates = []
     for p in prop.split(','):
+        dates = []
         if exists(data, p):
             v = getprop(data, p)
-
             if not isinstance(v, dict):
                 for s in (v if not isinstance(v, basestring) else [v]):
                     for part in s.split(";"):
@@ -287,24 +275,26 @@ def convert_dates(data, prop, earliest):
                         if len(stripped) < 4:
                             continue
                         a, b = parse_date_or_range(stripped)
-                        if b != '3000-01-01':
+                        if b != DEFAULT_DATETIME_STR:
                             dates.append( {
                                     "begin": a,
                                     "end": b,
                                     "displayDate" : display_date
                                 })
-
-    dates.sort(key=lambda d: d["begin"] if d["begin"] is not None else DEFAULT_DATETIME_STR)
-
-    value_to_set = dates
-    if earliest and dates:
-        value_to_set = dates[0]
-
-    if value_to_set:
-        setprop(data, p, value_to_set)
-    else:
-        if exists(data, p):
-            delprop(data, p)
+            else:
+                # Already filled in, probably by mapper
+                continue
+
+            dates.sort(key=lambda d: d["begin"] if d["begin"] is not None
+                                                else DEFAULT_DATETIME_STR)
+            if dates:
+                if earliest:
+                    value_to_set = dates[0]
+                else:
+                    value_to_set = dates
+                setprop(data, p, value_to_set)
+            else:
+                delprop(data, p)
 
 def check_date_format(data, prop):
     """Checks that the begin and end dates are in the proper format"""

diff --git a/lib/mappers/marc_mapper.py b/lib/mappers/marc_mapper.py
@@ -1,9 +1,11 @@
 import re
+import datetime
 from dplaingestion.utilities import iterify
 from dplaingestion.selector import exists, setprop, delprop
 from dplaingestion.selector import getprop
 from collections import OrderedDict
 from dplaingestion.mappers.mapper import Mapper
+from dplaingestion.utilities import strip_unclosed_brackets
 
 class MARCMapper(Mapper):                                                       
 
@@ -41,7 +43,7 @@ def __init__(self, provider_data, key_prefix=None):
         self.mapping_dict = {
             lambda t: t == "856":               [(self.map_is_shown_at, "u")],
             lambda t: t == "041":               [(self.map_language, "a")],
-            lambda t: t == "260":               [(self.map_date, "c"),
+            lambda t: t == "260":               [(self.map_display_date, "c"),
                                                  (self.map_publisher, "ab")],
 
             lambda t: t == "300":               [(self.map_extent, "ac")],
@@ -224,6 +226,19 @@ def _get_values(self, _dict, codes):
 
         return values
 
+    def _get_one_subfield(self, _dict, code):
+        """Get one MARC subfield having the given code
+
+        _dict: a dictionary of one element of the "datafield" list
+        code:  one MARC subfield character code
+        """
+        try:
+            subfields = [sf["#text"] for sf in self._get_subfields(_dict)
+                         if sf["code"] == code]
+            return subfields[0]  # assume there's just one
+        except (KeyError, IndexError):
+            return None
+
     def _get_subject_values(self, _dict, tag):
         """
         Extracts the "#text" values from _dict for the subject field and
@@ -296,9 +311,16 @@ def map_language(self, _dict, tag, codes):
         prop = "sourceResource/language"
         self.extend_prop(prop, _dict, codes)
 
-    def map_date(self, _dict, tag, codes):
-        prop = "sourceResource/date"
-        self.extend_prop(prop, _dict, codes)
+    def map_display_date(self, _dict, tag, code):
+        """Map what will be the displayDate to sourceResource/date.
+
+        This will be further processed down the pipeline, or recreated as
+        a dictionary by the Control Field 008 mapping.
+        """
+        date_given = self._get_one_subfield(_dict, code) or ""
+        semi_stripped = date_given.strip(";. ")
+        date = strip_unclosed_brackets(semi_stripped)
+        self.mapped_data["sourceResource"]["date"] = date
 
     def map_publisher(self, _dict, tag, codes):
         prop = "sourceResource/publisher"
@@ -507,7 +529,103 @@ def map_datafield_tags(self):
                                 func, index, codes = func_tuple
                                 func(_dict, tag, index, codes)
 
+    ### MARC control field 008 date-parsing functions
+    #   http://www.loc.gov/marc/archive/2000/concise/ecbd008s.html
+
+    def cf8_multiple_dates(self, s):
+        """Begin and end dates, MARC control field 008, Type of Date "m" or "q" """
+        begin = s[7:11]
+        end   = s[11:15]
+        return (begin, end)
+
+    def cf8_detailed_date(self, s):
+        """Begin and end dates for MARC control field 008, Type of Date "e"
+
+        Since this contains one date, begin and end are the same.
+        """
+        year  = s[7:11]
+        month = s[11:13]
+        day   = s[13:15]
+        date  = "%s-%s-%s" % (year, month, day)
+        return (date, date)
+
+    def cf8_single_date(self, s):
+        """Begin and end dates for MARC control field 008, Type of Date "s"
+
+        Since this contains one date, begin and end are the same.
+        """
+        year = s[7:11]
+        return (year, year)
+
+    def cf8_reissue_date(self, s):
+        """Begin and end dates for MARC control field 008, Type of Date "r"
+
+        Reissue date contains date reissued, and original date, if known.
+        Use the reissue date for both begin and end, because we're representing
+        one date.
+        """
+        year = s[7:11]
+        return (year, year)
+
+    def cf8_pub_copy_date(self, s):
+        """Begin and end dates for MARC control field 008, Type of Date "t"
+
+        Publication and copyright date.  We only represent the publication
+        date.
+        """
+        year = s[7:11]
+        return (year, year)
+
+    def cf8_serial_item_current(self, s):
+        """Begin and end dates, MARC control field 008, type "c"
+
+        Serial item in current publication
+        """
+        begin = s[7:11]
+        # The MARC spec says the end year is supposed to be "9999", but I've
+        # seen otherwise, and the current year looks better.  Since "9999" is
+        # a bogus value, anyway, I'm using the current year.
+        end   = str(datetime.datetime.today().year)
+        return (begin, end)
+
+    def cf8_serial_item_ceased_pub(self, s):
+        """Begin and end dates, MARC control field 008, type "d"
+
+        Serial item that has ceased publication
+        """
+        begin = s[7:11]
+        end   = s[11:15]
+        return (begin, end)
+
+    def display_date_for_none_given(self, begin, end):
+        """Construct a display date if none was given in field 260, subfield c"""
+        if begin != end:
+            return "%s-%s" % (begin, end)
+        else:
+            return begin
+
+    def set_begin_end_dates(self, begin, end):
+        """Given begin and end, set sourceResource/date properties"""
+        display_date = getprop(self.mapped_data, "sourceResource/date", True)
+        date = {
+                "displayDate": display_date or \
+                               self.display_date_for_none_given(begin, end),
+                "begin": begin,
+                "end": end
+               }
+        setprop(self.mapped_data, "sourceResource/date", date)
+
     def map_controlfield_tags(self):
+        date_func = {
+                     "m": "cf8_multiple_dates",
+                     "q": "cf8_multiple_dates",
+                     "s": "cf8_single_date",
+                     "e": "cf8_detailed_date",
+                     "r": "cf8_reissue_date",
+                     "t": "cf8_pub_copy_date",
+                     "d": "cf8_serial_item_ceased_pub",
+                     "c": "cf8_serial_item_current"
+                    }
         for item in iterify(getprop(self.provider_data, "controlfield")):
             if "#text" in item and "tag" in item:
                 if item["tag"] == "001":
@@ -521,6 +639,11 @@ def map_controlfield_tags(self):
                         pass
                 elif item["tag"] == "008":
                     text = item["#text"]
+                    type_of_date = text[6]
+                    if type_of_date in date_func:
+                        f = getattr(self, date_func[type_of_date])
+                        (begin, end) = f(text)
+                        self.set_begin_end_dates(begin, end)
                     if len(text) > 18:
                         self.control_008_18 = text[18]
                     if len(text) > 21:

diff --git a/lib/utilities.py b/lib/utilities.py
@@ -3,6 +3,7 @@
 import sys
 import time
 import tarfile
+import re
 from functools import wraps
 
 def iterify(iterable):
@@ -94,3 +95,19 @@ def pause(attempt):
         return func_with_retries
 
     return apply_with_retries
+
+def clean_date(d):
+    """Return a given date string without certain characters and expressions"""
+    regex = [("\s*to\s*|\s[-/]\s", "-"), ("[\?\(\)]|\s*ca\.?\s*|~|x", "")]
+    if not "circa" in d and not "century" in d:
+        regex.append(("\s*c\.?\s*", ""))
+    for p, r in regex:
+        d = re.sub(p, r, d)
+    return d.strip()
+
+def remove_brackets_and_strip(d):
+    """Remove all square brackets, then strip outer periods and spaces"""
+    return d.replace("[", "").replace("]", "").strip(". ")
+
+def strip_unclosed_brackets(s):
+    return re.sub(r'\[(?![^\]]*?\])', '', s)
diff --git a/profiles/artstor.pjs b/profiles/artstor.pjs
@@ -3,6 +3,9 @@
     "type": "oai_verbs",
     "endpoint_url": "http://oai.artstor.org/oaicatmuseum/OAIHandler",
     "sets": [
+        "SSDPLABrynMawr",
+        "SSDPLACornell",
+        "SSDPLAUCSD",
         "SSDPLAWashington",
         "SSDelwareAtlas",
         "SSDelwareGeorge",

diff --git a/setup.py b/setup.py
@@ -31,7 +31,7 @@
 from distutils.core import setup
 
 setup( name = 'ingestion',
-       version = '29.0',
+       version = '29.1',
        description='DPLA Ingestion Subsystem',
        author='Digital Public Library of America',
        author_email='[email protected]',

diff --git a/test/test_enrich_date.py b/test/test_enrich_date.py
@@ -391,18 +391,31 @@ def test_enrich_date_parse_format_date_range4():
 
 def test_enrich_date_parse_century_date():
     """Correctly transform a date of format '19th c.'"""
-    INPUT = ["19th c.", "19th century"]
-
     url = server() + "enrich_earliest_date?prop=date"
-
-    for i in INPUT:
-        input = {"date": i}
-        resp,content = H.request(url,"POST",body=json.dumps(input))
-        assert str(resp.status).startswith("2")
-
-        result = json.loads(content)
-        expected = {"date": {"begin": None, "end": None, "displayDate": i}}
-        assert result['date'] == expected[u'date'], "%s != %s" % (result['date'], expected[u'date'])
+    INPUT = {"date": "19th c."}
+    EXPECTED = {
+        "date": {
+            "begin": None,
+            "end": None,
+            "displayDate": "19th c"  # trailing period is stripped; assumed OK
+        }
+    }
+    resp,content = H.request(url,"POST",body=json.dumps(INPUT))
+    result = json.loads(content)
+    assert result["date"] == EXPECTED["date"], \
+           "%s != %s" % (result["date"], EXPECTED["date"])
+    INPUT = {"date": "19th century"}
+    EXPECTED = {
+        "date": {
+            "begin": None,
+            "end": None,
+            "displayDate": "19th century"
+        }
+    }
+    resp,content = H.request(url,"POST",body=json.dumps(INPUT))
+    result = json.loads(content)
+    assert result["date"] == EXPECTED["date"], \
+           "%s != %s" % (result["date"], EXPECTED["date"])
 
 
 def test_enrich_date_parse_century_date_with_P():
@@ -414,7 +427,7 @@ def test_enrich_date_parse_century_date_with_P():
         u'date' : {
             u'begin' : None,
             u'end' : None,
-            "displayDate" : "19th c."
+            u"displayDate" : u"19th c"
         }
     }