Add new harvest dcat daera_core profile (#15)

* [setup.py][dcat.py] Adds code for profile DaeraCoreProfile
* [dcat.py] Fixes typo
datopian · Jul 1, 2020 · 6ef09e0 · 6ef09e0
1 parent a875da9
commit 6ef09e0
Show file tree

Hide file tree

Showing 2 changed files with 64 additions and 0 deletions.
diff --git a/ckanext/opendatani/dcat.py b/ckanext/opendatani/dcat.py
@@ -271,6 +271,69 @@ def parse_dataset(self, dataset_dict, dataset_ref):
 
         return dataset_dict
 
+class DaeraCoreProfile(RDFProfile):
+    """Harvest profile that applies DAERA defaults to a parsed dataset.
+
+    Besides filling in fixed metadata values, it can optionally issue a
+    HEAD request against downloadable resources so the remote ArcGIS
+    server starts processing the files.
+    """
+
+    def parse_dataset(self, dataset_dict, dataset_ref):
+        """Set default metadata and optionally ping ArcGIS resource URLs.
+
+        :param dataset_dict: dataset dict being built; modified in place.
+        :param dataset_ref: not used by this method.
+        :returns: the same ``dataset_dict``.
+
+        When the ``ckanext.opendatani.harvest.ping_arcgis_urls`` config
+        option is truthy, every resource gets a boolean ``requested``
+        key telling whether its URL has already been pinged.
+        """
+
+        # TODO: if there is more than one source with different defaults,
+        # modify accordingly
+        dataset_dict['frequency'] = 'notPlanned'
+        dataset_dict['topic_category'] = 'location'
+        dataset_dict['lineage'] = '-'
+        dataset_dict['contact_name'] = 'DAERA Open Data Enquiries'
+        # NOTE(review): this address looks redacted/obfuscated -- confirm
+        # the real value before relying on it.
+        dataset_dict['contact_email'] = '[email protected]'
+        dataset_dict['license_id'] = 'uk-ogl'
+
+        # The values now live as top-level keys; presumably this drops
+        # the extras of the same name (helper defined in this module).
+        _remove_extra('contact_name', dataset_dict)
+        _remove_extra('contact_email', dataset_dict)
+
+        if not toolkit.asbool(
+                config.get('ckanext.opendatani.harvest.ping_arcgis_urls')):
+            return dataset_dict
+
+        # Ping the ArcGIS server so the processing of the files
+        # starts
+        file_formats = ('geojson', 'kml', 'zip', 'csv')
+        # Seconds to wait for the server; without a timeout a silent
+        # host would block the harvest job forever.
+        ping_timeout = 30
+
+        identifier = None
+        for extra in dataset_dict.get('extras', []):
+            if extra['key'] == 'identifier' and extra['value']:
+                identifier = extra['value']
+
+        # URLs already pinged on a previous harvest of this dataset.
+        avoid = set()
+        if identifier:
+            query = toolkit.get_action('package_search')(
+                {}, {'q': 'guid:"{0}"'.format(identifier)})
+            if query['count']:
+                current_dataset = query['results'][0]
+                for current_resource in current_dataset.get('resources',
+                                                            []):
+                    if ('requested' in current_resource and
+                            toolkit.asbool(current_resource['requested'])):
+                        current_url = current_resource.get('url')
+                        if current_url:
+                            avoid.add(current_url)
+
+        for resource in dataset_dict.get('resources', []):
+            # A resource may come without a format or URL; treat both
+            # as empty instead of failing with KeyError.
+            resource_format = resource.get('format') or ''
+            if resource_format == 'OGC WMS':
+                resource_format = 'WMS'
+                resource['format'] = resource_format
+
+            resource['requested'] = False
+
+            url = resource.get('url')
+            if not url:
+                continue
+
+            if url in avoid:
+                resource['requested'] = True
+            elif resource_format.lower() in file_formats:
+                try:
+                    requests.head(url, timeout=ping_timeout)
+                except Exception as e:
+                    # Best effort: 'requested' stays False, so the URL
+                    # is pinged again on the next harvest.
+                    log.debug(
+                        'Error requesting resource: {0}\n{1}'
+                        .format(url, e))
+                else:
+                    resource['requested'] = True
+                    log.debug(
+                        'Requested resource to start the processing: {0}'
+                        .format(url))
+
+        return dataset_dict
+
 
 def _remove_extra(key, dataset_dict):
         dataset_dict['extras'][:] = [e

diff --git a/setup.py b/setup.py
@@ -89,6 +89,7 @@
         causeway_profile=ckanext.opendatani.dcat:CausewayProfile
         midulster_profile=ckanext.opendatani.dcat:MidulsterProfile
         esri_arcgis_profile=ckanext.opendatani.dcat:EsriArcGISProfile
+        daera_core_profile=ckanext.opendatani.dcat:DaeraCoreProfile
 
     ''',
 )