From 0697cc480815b3e40cd1ecd47822b4aac4955ac8 Mon Sep 17 00:00:00 2001 From: Abram Booth Date: Thu, 25 May 2017 09:27:43 -0400 Subject: [PATCH] Add tind.io harvester for AgEcon with MODS --- setup.py | 3 +- share/harvesters/edu_ageconsearch.py | 117 ----------------- share/harvesters/io_tind.py | 81 ++++++++++++ share/sources/edu.ageconsearch/source.yaml | 16 ++- share/transformers/edu_ageconsearch.py | 138 --------------------- share/transformers/mods.py | 21 +++- 6 files changed, 107 insertions(+), 269 deletions(-) delete mode 100644 share/harvesters/edu_ageconsearch.py create mode 100644 share/harvesters/io_tind.py delete mode 100644 share/transformers/edu_ageconsearch.py diff --git a/setup.py b/setup.py index 93493cf24..2fb639274 100644 --- a/setup.py +++ b/setup.py @@ -19,7 +19,6 @@ 'com.peerj.xml = share.transformers.com_peerj_xml:PeerJXMLTransformer', 'com.researchregistry = share.transformers.com_researchregistry:RRTransformer', 'com.springer = share.transformers.com_springer:SpringerTransformer', - 'edu.ageconsearch = share.transformers.edu_ageconsearch:AgeconTransformer', 'edu.gwu = share.transformers.edu_gwu:GWScholarSpaceTransformer', 'edu.harvarddataverse = share.transformers.edu_harvarddataverse:HarvardTransformer', 'gov.clinicaltrials = share.transformers.gov_clinicaltrials:ClinicalTrialsTransformer', @@ -57,7 +56,6 @@ 'com.peerj = share.harvesters.com_peerj:PeerJHarvester', 'com.researchregistry = share.harvesters.com_researchregistry:ResearchRegistryHarvester', 'com.springer = share.harvesters.com_springer:SpringerHarvester', - 'edu.ageconsearch = share.harvesters.edu_ageconsearch:AgEconHarvester', 'edu.gwu = share.harvesters.edu_gwu:GWScholarSpaceHarvester', 'edu.harvarddataverse = share.harvesters.edu_harvarddataverse:HarvardDataverseHarvester', 'gov.clinicaltrials = share.harvesters.gov_clinicaltrials:ClinicalTrialsHarvester', @@ -67,6 +65,7 @@ 'gov.scitech = share.harvesters.gov_scitech:SciTechHarvester', 'gov.usgs = share.harvesters.gov_usgs:USGSHarvester', 'io.osf = share.harvesters.io_osf:OSFHarvester', + 'io.tind = share.harvesters.io_tind:TindHarvester', 'oai = share.harvesters.oai:OAIHarvester', 'org.arxiv = share.harvesters.org_arxiv:ArxivHarvester', 'org.biorxiv = share.harvesters.org_biorxiv:BiorxivHarvester', diff --git a/share/harvesters/edu_ageconsearch.py b/share/harvesters/edu_ageconsearch.py deleted file mode 100644 index 4f340e5f9..000000000 --- a/share/harvesters/edu_ageconsearch.py +++ /dev/null @@ -1,117 +0,0 @@ -import logging -import dateutil - -from bs4 import BeautifulSoup -from furl import furl -import pendulum - -from share.harvest import BaseHarvester - -logger = logging.getLogger('__name__') - - -class AgEconHarvester(BaseHarvester): - """ - Query Parameters: - month (MM) - year (YYYY) - order (oldestFirst or None) - starts_with (YYYY-MM-DD) they don't always have a day - top (page number) - - Returns: - Page with nearest date - 20 records/page - """ - VERSION = 1 - - fields = { - 'title': 'title', - 'other titles': 'other_titles', - 'authors': 'authors', - 'editors': 'editors', - 'editors (email)': 'editors_email', - 'authors (email)': 'authors_email', - 'keywords': 'keywords', - 'jel codes': 'jel_codes', - 'issue date': 'issue_date', - 'series/report no.': 'series_report_number', - 'abstract': 'abstract', - 'uri': 'uri', - 'institution/association': 'institution_association', - 'identifiers': 'identifiers', - 'total pages': 'total_pages', - 'from page': 'from_page', - 'to page': 'to_page', - 'notes': 'notes', - 'collections:': 
'collections', - } - - # Request page with nearest date - def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum): - return self.fetch_records(start_date, end_date) - - # Fetch the list of work urls on a single result page and return results within date range - def fetch_records(self, start_date, end_date): - logger.info('Harvesting %s - %s', start_date, end_date) - logger.debug('Fetching page %s', self.config.base_url) - - url = furl(self.config.base_url) - url.args['starts_with'] = start_date - r = self.requests.get(url.url) - - r.raise_for_status() - within_date_range = True - while within_date_range: - document = BeautifulSoup(r.text, 'html.parser') - results = document.select('a[href^="/handle/"]')[1:] - for result in results: - url = 'http://ageconsearch.umn.edu{}'.format(result.attrs['href']) - work = self.fetch_work(url) - date_status = self.check_record_date(work['issue_date'], start_date, end_date) - - # if date is > start_date continue and skip - if date_status == 'after': - continue - elif date_status == 'before': - within_date_range = False - return - yield work['primary_identifier'], work - - r = self.requests.get('http://ageconsearch.umn.edu/{}'.format(document.find('a', string='Next page').attrs['href'])) - - def check_record_date(self, issue_date, start_date, end_date): - date_object = dateutil.parser.parse(issue_date, default=pendulum.create(2016, 1, 1)) - - if date_object < start_date.start_of('day'): - return 'before' - if date_object > end_date.end_of('day'): - return 'after' - - return 'within' - - # Pull data out of html - def fetch_work(self, url): - r = self.requests.get(url) - r.raise_for_status() - soup = BeautifulSoup(r.text, 'lxml') - data = {} - - data['primary_identifier'] = soup.find('code').text - display_table = soup.find(class_='itemDisplayTable').find_all('tr') - - for row in display_table: - label = row.find(class_='metadataFieldLabel').text.replace(':\xa0', '').lower() - value_object = row.find(class_='metadataFieldValue') - if value_object.string: - value = value_object.string - else: - contents = [] - for content in value_object.contents: - contents.append(content.string or content) - # Feels a little hacky - value = [val for val in contents if val != BeautifulSoup('
<br/>', 'lxml').br]
-
-            data[self.fields[label]] = value
-
-        return data
diff --git a/share/harvesters/io_tind.py b/share/harvesters/io_tind.py
new file mode 100644
index 000000000..47166bebd
--- /dev/null
+++ b/share/harvesters/io_tind.py
@@ -0,0 +1,81 @@
+import logging
+import dateutil
+
+from furl import furl
+from lxml import etree
+import pendulum
+
+from share.harvest import BaseHarvester
+
+logger = logging.getLogger(__name__)
+
+
+class TindHarvester(BaseHarvester):
+    """
+    Expected harvester kwargs:
+        collection: collection name to harvest
+        page_size: records per request
+        format_code:
+            'xo': MODS XML
+            'xd': Dublin Core-ish XML
+            'xm': MARC XML
+            'hm': MARC
+            'hb': HTML
+
+    API Query Parameters:
+        dt (type of date filter: 'm' for date modified)
+        d1d (start of date range day)
+        d1m (start of date range month)
+        d1y (start of date range year)
+        d2d (end of date range day)
+        d2m (end of date range month)
+        d2y (end of date range year)
+        sc (split by collection: 0 or 1)
+        sf (sort field: e.g. 'latest first')
+        so (sort order: 'a' for ascending, 'd' for descending)
+        rg (page size)
+        jrec (offset)
+        of (format code, see above)
+    """
+    VERSION = 1
+
+    namespaces = {
+        'mods': 'http://www.loc.gov/mods/v3',
+    }
+
+    def do_harvest(self, start_date: pendulum.Pendulum, end_date: pendulum.Pendulum):
+        page_size = self.kwargs['page_size']
+        offset = 1
+        url = furl(self.config.base_url)
+        url.args.update({
+            'c': self.kwargs['collection'],
+            'of': self.kwargs['format_code'],
+            'rg': page_size,
+            'dt': 'm',
+            'd1d': start_date.day,
+            'd1m': start_date.month,
+            'd1y': start_date.year,
+            'd2d': end_date.day,
+            'd2m': end_date.month,
+            'd2y': end_date.year,
+            'sc': 0,  # Splitting by collection screws up the page size
+            'sf': 'latest first',
+            'so': 'd',
+        })
+
+        while True:
+            logger.debug('Making request to %s', url.url)
+            resp = self.requests.get(url.url)
+            resp.raise_for_status()
+
+            parsed = etree.fromstring(resp.content, parser=etree.XMLParser(recover=True))
+            records = parsed.xpath('/modsCollection/mods:mods', namespaces=self.namespaces)
+            if not records:
+                break
+
+            for record in records:
+                id = record.xpath('mods:recordInfo/mods:recordIdentifier', namespaces=self.namespaces)[0].text
+                yield (id, etree.tostring(record, encoding=str))
+
+            offset += page_size
+            url.args['jrec'] = offset
diff --git a/share/sources/edu.ageconsearch/source.yaml b/share/sources/edu.ageconsearch/source.yaml
index 7d38637f1..4102e4192 100644
--- a/share/sources/edu.ageconsearch/source.yaml
+++ b/share/sources/edu.ageconsearch/source.yaml
@@ -1,14 +1,18 @@
 configs:
-- base_url: http://ageconsearch.umn.edu/browse-date
+- base_url: http://ageconsearch.tind.io/search
   disabled: false
   earliest_date: null
-  harvester: edu.ageconsearch
-  harvester_kwargs: {}
-  label: edu.ageconsearch
+  harvester: io.tind
+  harvester_kwargs:
+    collection: AgEcon Search
+    page_size: 100
+    format_code: xo
+  label: edu.ageconsearch.tind
   rate_limit_allowance: 1
   rate_limit_period: 2
-  transformer: edu.ageconsearch
-  transformer_kwargs: {}
+  transformer: mods
+  transformer_kwargs:
+    emitted_type: Preprint
 home_page: http://ageconsearch.umn.edu/
 long_title: AgEcon Search
 name: edu.ageconsearch
diff --git a/share/transformers/edu_ageconsearch.py b/share/transformers/edu_ageconsearch.py
deleted file mode 100644
index 4052ae71f..000000000
--- a/share/transformers/edu_ageconsearch.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import re
-
-from share.transform.chain import *
-
-
-class WorkIdentifier(Parser):
-    uri = IRI(ctx)
-
-
-class AgentIdentifier(Parser):
- uri = ctx - - -class Agent(Parser): - schema = GuessAgentType(ctx.name) - name = ctx.name - identifiers = Map(Delegate(AgentIdentifier), Try(IRI(ctx.email))) - - -class ContributorRelation(Parser): - schema = 'Contributor' - - agent = Delegate(Agent, ctx) - cited_as = ctx.name - - -class CreatorRelation(ContributorRelation): - schema = 'Creator' - - order_cited = ctx('index') - - -class AffiliatedAgent(Parser): - schema = GuessAgentType(ctx, default='organization') - name = ctx - - -class AgentWorkRelation(Parser): - agent = Delegate(AffiliatedAgent, ctx) - - -class Tag(Parser): - name = ctx - - -class ThroughTags(Parser): - tag = Delegate(Tag, ctx) - - -class Subject(Parser): - name = ctx - - -class ThroughSubjects(Parser): - subject = Delegate(Subject, ctx) - - -class Preprint(Parser): - title = Try(ctx.title) - description = Try(ctx.abstract) - identifiers = Concat( - Map(Delegate(WorkIdentifier), ctx.primary_identifier), - Map(Delegate(WorkIdentifier), ctx.uri), - Map(Delegate(WorkIdentifier), Try(ctx.identifiers)), - ) - - related_agents = Concat( - Map( - Delegate(CreatorRelation), - RunPython('get_agent_emails', ctx, 'authors', 'authors_email') - ), - Map( - Delegate(ContributorRelation), - RunPython('get_agent_emails', ctx, 'editors', 'editors_email') - ), - Map( - Delegate(AgentWorkRelation), - RunPython('get_affiliated_organization', Try(ctx.institution_association)) - ) - ) - - tags = Map(Delegate(ThroughTags), Try(ctx.keywords)) - date_published = ParseDate(Try(ctx.issue_date)) - subjects = Map(Delegate(ThroughSubjects), Subjects(Try(ctx.jel_codes))) - - class Extra: - other_titles = Try(ctx.other_titles) - notes = Try(ctx.notes) - editors = Try(ctx.editors) - editors_email = Try(ctx.editors_email) - authors = Try(ctx.authors) - authors_email = Try(ctx.authors_email) - series_report_number = Try(ctx.series_report_number) - institution_association = Try(ctx.institution_association) - collections = Try(ctx.collections) - total_pages = Try(ctx.total_pages) - from_page = Try(ctx.from_page) - to_page = Try(ctx.to_page) - identifiers = Try(ctx.identifiers) - uri = ctx.uri - - def get_agent_emails(self, ctx, agent_key, email_key): - """ - emails format: [name (email), name (email)] - """ - try: - agents = ctx[agent_key] if isinstance(ctx[agent_key], list) else [ctx[agent_key]] - except KeyError: - agents = [] - - try: - emails = ctx[email_key] if isinstance(ctx[email_key], list) else [ctx[email_key]] - except KeyError: - emails = [] - - agent_objects = [] - - for agent in agents: - agent_object = {'name': agent} - - agent_email = next((x for x in emails if agent in x), None) - - if agent_email: - agent_object['email'] = re.compile('\((\S+?)\)').search(agent_email).group(1) - agent_objects.append(agent_object) - - return agent_objects - - def get_affiliated_organization(self, affiliation): - """ - affiliation format: 'name>volume issue etc' - """ - return affiliation.split('>')[0] - - -class AgeconTransformer(ChainTransformer): - VERSION = 1 - root_parser = Preprint diff --git a/share/transformers/mods.py b/share/transformers/mods.py index dfc04709f..5bcd311d3 100644 --- a/share/transformers/mods.py +++ b/share/transformers/mods.py @@ -240,7 +240,8 @@ class MODSCreativeWork(Parser): lambda obj: 'invalid' not in obj, tools.Concat( tools.Try(ctx['mods:identifier']), - tools.Try(ctx.header['identifier']) + tools.Try(ctx.header['identifier']), + tools.Try(ctx['mods:location']['mods:url']), ) ) ) @@ -313,7 +314,11 @@ class MODSCreativeWork(Parser): ) ) - date_updated = 
tools.ParseDate(tools.Try(ctx.header.datestamp)) + date_updated = tools.OneOf( + tools.ParseDate(ctx.header.datestamp), + tools.ParseDate(ctx['mods:recordInfo']['mods:recordChangeDate']), + tools.Static(None) + ) # TODO (in regulator) handle date ranges, uncertain dates ('1904-1941', '1890?', '1980-', '19uu', etc.) date_published = tools.OneOf( @@ -492,7 +497,11 @@ def do_transform(self, data): def unwrap_data(self, data): unwrapped_data = xmltodict.parse(data, process_namespaces=True, namespaces=self.kwargs.get('namespaces', self.NAMESPACES)) - return { - **unwrapped_data['record'].get('metadata', {}).get('mods:mods', {}), - 'header': unwrapped_data['record']['header'], - } + if 'record' in unwrapped_data: + return { + **unwrapped_data['record'].get('metadata', {}).get('mods:mods', {}), + 'header': unwrapped_data['record']['header'], + } + elif 'mods:mods' in unwrapped_data: + return unwrapped_data['mods:mods'] + raise ValueError('Unrecognized MODS wrapper!\n{}'.format(data))
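
For reference, a minimal sketch of the two wrapper shapes the updated unwrap_data accepts. This is illustrative only, not part of the patch: the XML payloads are fabricated stand-ins, and the namespace map mirrors the URI-to-prefix form the MODS transformer passes to xmltodict.

    import xmltodict

    # Maps the MODS namespace URI to the 'mods' prefix used in parsed keys.
    NAMESPACES = {'http://www.loc.gov/mods/v3': 'mods'}

    MODS_BODY = (
        '<mods:mods xmlns:mods="http://www.loc.gov/mods/v3">'
        '<mods:titleInfo><mods:title>Example</mods:title></mods:titleInfo>'
        '</mods:mods>'
    )

    # Shape 1: an OAI-PMH record. unwrap_data sees 'record' at the root,
    # takes the pre-existing branch, and merges the metadata with the header.
    oai_record = (
        '<record>'
        '<header><identifier>oai:example:1</identifier>'
        '<datestamp>2017-05-25</datestamp></header>'
        '<metadata>' + MODS_BODY + '</metadata>'
        '</record>'
    )

    # Shape 2: a bare mods:mods element, which is what TindHarvester yields
    # via etree.tostring(record, encoding=str). unwrap_data sees 'mods:mods'
    # at the root and takes the new elif branch.
    for raw in (oai_record, MODS_BODY):
        parsed = xmltodict.parse(raw, process_namespaces=True, namespaces=NAMESPACES)
        print('record' in parsed, 'mods:mods' in parsed)
        # Prints: True False, then False True

A payload that parses to neither root key now raises ValueError instead of failing with a KeyError on unwrapped_data['record'], which should make malformed responses easier to spot during harvests.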