From 6ef09e0a0a60255beec19635ff6c1b55481228c4 Mon Sep 17 00:00:00 2001
From: MuhammadIsmailShahzad <57398621+MuhammadIsmailShahzad@users.noreply.github.com>
Date: Wed, 1 Jul 2020 08:48:19 +0500
Subject: [PATCH] Add new harvest dcat daera_core profile (#15)

* [setup.py][dcat.py]Adds code for profile DaeraCoreProfile

* [dcat.py] Fixes typo
---
 ckanext/opendatani/dcat.py | 83 ++++++++++++++++++++++++++++++++++++++
 setup.py                   |  1 +
 2 files changed, 84 insertions(+)

diff --git a/ckanext/opendatani/dcat.py b/ckanext/opendatani/dcat.py
index d92662e..478de17 100644
--- a/ckanext/opendatani/dcat.py
+++ b/ckanext/opendatani/dcat.py
@@ -271,6 +271,89 @@ def parse_dataset(self, dataset_dict, dataset_ref):
 
         return dataset_dict
 
+class DaeraCoreProfile(RDFProfile):
+    """Harvest profile that applies DAERA defaults to parsed datasets.
+
+    It can also send a HEAD request to downloadable resource URLs and
+    record the outcome in each resource's ``requested`` flag.
+    """
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+        """Set the DAERA defaults and optionally ping the resource URLs.
+
+        :param dataset_dict: dataset being built by the harvester; it is
+            modified in place and must already contain an ``extras`` list
+        :param dataset_ref: reference of the dataset node (unused here)
+        :returns: the same ``dataset_dict``
+        """
+
+        # TODO: if there is more than one source with different defaults,
+        # modify accordingly
+        dataset_dict['frequency'] = 'notPlanned'
+        dataset_dict['topic_category'] = 'location'
+        dataset_dict['lineage'] = '-'
+        dataset_dict['contact_name'] = 'DAERA Open Data Enquiries'
+        dataset_dict['contact_email'] = 'OpenDataEnquiries@daera-ni.gov.uk'
+        dataset_dict['license_id'] = 'uk-ogl'
+
+        _remove_extra('contact_name', dataset_dict)
+        _remove_extra('contact_email', dataset_dict)
+
+        # Ping the ArcGIS server so the processing of the files
+        # starts
+        identifier = None
+        avoid = []
+
+        if toolkit.asbool(
+                config.get('ckanext.opendatani.harvest.ping_arcgis_urls')):
+
+            for extra in dataset_dict.get('extras', []):
+                if extra['key'] == 'identifier' and extra['value']:
+                    identifier = extra['value']
+            if identifier:
+                # Resources already flagged as requested on the dataset
+                # found for this guid are not pinged again
+                query = toolkit.get_action('package_search')(
+                    {}, {'q': 'guid:"{0}"'.format(identifier)})
+                if query['count']:
+                    current_dataset = query['results'][0]
+                    for current_resource in current_dataset.get('resources',
+                                                                []):
+                        if ('requested' in current_resource and
+                                toolkit.asbool(current_resource['requested'])):
+                            avoid.append(current_resource['url'])
+
+            file_formats = ('geojson', 'kml', 'zip', 'csv')
+
+            for resource in dataset_dict.get('resources', []):
+                # Tolerate resources that have no (or an empty) format
+                resource_format = resource.get('format') or ''
+                if resource_format == 'OGC WMS':
+                    resource_format = resource['format'] = 'WMS'
+
+                resource['requested'] = False
+
+                if resource['url'] in avoid:
+                    resource['requested'] = True
+                elif resource_format.lower() in file_formats:
+                    try:
+                        # Without a timeout an unresponsive server would
+                        # block the harvest job indefinitely
+                        requests.head(resource['url'], timeout=30)
+
+                        resource['requested'] = True
+                        log.debug(
+                            'Requested resource to start the processing: {0}'
+                            .format(resource['url']))
+                    except Exception as e:
+                        # Best effort: a failed ping must not abort the
+                        # harvest of this dataset
+                        log.debug(
+                            'Error requesting resource: {0}\n{1}'
+                            .format(resource['url'], e))
+
+        return dataset_dict
+
 
 def _remove_extra(key, dataset_dict):
     dataset_dict['extras'][:] = [e
diff --git a/setup.py b/setup.py
index 62a38ea..e6d22b2 100644
--- a/setup.py
+++ b/setup.py
@@ -89,6 +89,7 @@
         causeway_profile=ckanext.opendatani.dcat:CausewayProfile
         midulster_profile=ckanext.opendatani.dcat:MidulsterProfile
         esri_arcgis_profile=ckanext.opendatani.dcat:EsriArcGISProfile
+        daera_core_profile=ckanext.opendatani.dcat:DaeraCoreProfile
     ''',
 )
 