From 6ef09e0a0a60255beec19635ff6c1b55481228c4 Mon Sep 17 00:00:00 2001
From: MuhammadIsmailShahzad
 <57398621+MuhammadIsmailShahzad@users.noreply.github.com>
Date: Wed, 1 Jul 2020 08:48:19 +0500
Subject: [PATCH] Add new harvest dcat daera_core profile (#15)

* [setup.py][dcat.py] Adds code for profile DaeraCoreProfile

* [dcat.py] Fixes typo
---
 ckanext/opendatani/dcat.py | 63 ++++++++++++++++++++++++++++++++++++++
 setup.py                   |  1 +
 2 files changed, 64 insertions(+)

diff --git a/ckanext/opendatani/dcat.py b/ckanext/opendatani/dcat.py
index d92662e..478de17 100644
--- a/ckanext/opendatani/dcat.py
+++ b/ckanext/opendatani/dcat.py
@@ -271,6 +271,69 @@ def parse_dataset(self, dataset_dict, dataset_ref):
 
         return dataset_dict
 
+class DaeraCoreProfile(RDFProfile):
+    '''Harvest profile applying DAERA defaults and pinging ArcGIS files.'''
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+        '''Apply the DAERA defaults and, if configured, ping file URLs.
+
+        With ``ckanext.opendatani.harvest.ping_arcgis_urls`` enabled each
+        file resource gets a HEAD request, recorded in ``requested``.
+        Returns ``dataset_dict``, which is modified in place.
+        '''
+        # TODO: if there is more than one source with different defaults,
+        # modify accordingly
+        dataset_dict['frequency'] = 'notPlanned'
+        dataset_dict['topic_category'] = 'location'
+        dataset_dict['lineage'] = '-'
+        dataset_dict['contact_name'] = 'DAERA Open Data Enquiries'
+        dataset_dict['contact_email'] = 'OpenDataEnquiries@daera-ni.gov.uk'
+        dataset_dict['license_id'] = 'uk-ogl'
+        _remove_extra('contact_name', dataset_dict)
+        _remove_extra('contact_email', dataset_dict)
+
+        if not toolkit.asbool(
+                config.get('ckanext.opendatani.harvest.ping_arcgis_urls')):
+            return dataset_dict
+
+        # Ping the ArcGIS server so the processing of the files starts
+        identifier = None
+        avoid = set()
+        file_formats = ('geojson', 'kml', 'zip', 'csv')
+        for extra in dataset_dict.get('extras', []):
+            if extra['key'] == 'identifier' and extra['value']:
+                identifier = extra['value']
+        if identifier:
+            query = toolkit.get_action('package_search')(
+                {}, {'q': 'guid:"{0}"'.format(identifier)})
+            if query['count']:
+                # URLs the stored dataset marks as requested are not pinged
+                for current in query['results'][0].get('resources', []):
+                    if toolkit.asbool(current.get('requested')):
+                        avoid.add(current['url'])
+
+        for resource in dataset_dict.get('resources', []):
+            # A resource without a format is skipped instead of failing
+            file_format = resource.get('format') or ''
+            if file_format == 'OGC WMS':
+                file_format = resource['format'] = 'WMS'
+            url = resource['url']
+            pinged = resource['requested'] = url in avoid
+            if pinged or file_format.lower() not in file_formats:
+                continue
+            try:
+                # Bounded wait so one stalled server cannot hang the run
+                requests.head(url, timeout=60)
+                resource['requested'] = True
+                log.debug(
+                    'Requested resource to start the processing: {0}'
+                    .format(url))
+            except Exception as e:
+                log.debug(
+                    'Error requesting resource: {0}\n{1}'.format(url, e))
+
+        return dataset_dict
+
 
 def _remove_extra(key, dataset_dict):
         dataset_dict['extras'][:] = [e
diff --git a/setup.py b/setup.py
index 62a38ea..e6d22b2 100644
--- a/setup.py
+++ b/setup.py
@@ -89,6 +89,7 @@
         causeway_profile=ckanext.opendatani.dcat:CausewayProfile
         midulster_profile=ckanext.opendatani.dcat:MidulsterProfile
         esri_arcgis_profile=ckanext.opendatani.dcat:EsriArcGISProfile
+        daera_core_profile=ckanext.opendatani.dcat:DaeraCoreProfile
 
     ''',
 )