Skip to content

Commit

Permalink
feat: add CSIRO harvester
Browse files Browse the repository at this point in the history
  • Loading branch information
smotornyuk committed Oct 20, 2024
1 parent 1367232 commit 5b12edc
Show file tree
Hide file tree
Showing 7 changed files with 178 additions and 18 deletions.
2 changes: 2 additions & 0 deletions ckanext/harvest_basket/harvesters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from ckanext.harvest_basket.harvesters.ods_harvester import ODSHarvester
from .dcat import BasketDcatJsonHarvester
from .csw import BasketCswHarvester
from .csiro import CsiroHarvester

__all__ = [
"DKANHarvester",
Expand All @@ -16,4 +17,5 @@
"ODSHarvester",
"BasketDcatJsonHarvester",
"BasketCswHarvester",
"CsiroHarvester",
]
11 changes: 9 additions & 2 deletions ckanext/harvest_basket/harvesters/base_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import logging
import json
import uuid
from typing import Optional
from typing import Any, Optional
from datetime import datetime as dt
from dateutil import parser
from html import unescape
Expand All @@ -14,6 +14,7 @@
from ckan.plugins import plugin_loaded
from ckan import model
from ckan.lib.munge import munge_tag
from ckanext.transmute.utils import get_schema

from ckanext.harvest.harvesters.base import HarvesterBase
try:
Expand Down Expand Up @@ -163,6 +164,12 @@ def _fetch_tags(self, tag_list: list[str]) -> list[dict[str, str]]:

return tags

def _transmute_content(self, package_dict: dict[str, Any]):
schema = self.config.get("tsm_schema")
if not schema and (schema_name := self.config.get("tsm_named_schema")):
schema = get_schema(schema_name)
self.transmute_data(package_dict, schema)

def import_stage(self, harvest_object):
self.base_context = {
"model": model,
Expand All @@ -182,7 +189,7 @@ def import_stage(self, harvest_object):

package_dict = json.loads(harvest_object.content)

self.transmute_data(package_dict, config.get("tsm_schema"))
self._transmute_content(package_dict)

if package_dict.get("type") == "harvest":
log.info("Remote dataset is a harvest source, ignoring...")
Expand Down
6 changes: 1 addition & 5 deletions ckanext/harvest_basket/harvesters/ckan_harvester.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,7 @@ def import_stage(self, harvest_object):
package_dict = json.loads(harvest_object.content)
self._set_config(harvest_object.source.config)

schema = self.config.get("tsm_schema")
if not schema and (schema_name := self.config.get("tsm_named_schema")):
schema = get_schema(schema_name)

self.transmute_data(package_dict, schema)
self._transmute_content(package_dict)

harvest_object.content = json.dumps(package_dict)
super().import_stage(harvest_object)
Expand Down
163 changes: 163 additions & 0 deletions ckanext/harvest_basket/harvesters/csiro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
from __future__ import annotations

import contextlib
import logging
import json
import uuid
from typing import Any, Iterable

from ckan.lib.munge import munge_name, munge_tag
from ckan.logic import ValidationError
import ckan.plugins.toolkit as tk

from ckan import model
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.harvesters.ckanharvester import SearchError
from ckanext.spatial import harvesters

from ckanext.harvest_basket.harvesters.base_harvester import BasketBasicHarvester


log = logging.getLogger(__name__)


from requests_cache import install_cache


def ff(resp):
return resp.url.startswith("https://data.csiro.au/")


install_cache("/tmp/csw-harvester", "sqlite", filter_fn=ff)


class CsiroHarvester(BasketBasicHarvester):
SRC_ID = "CSIRO"

def info(self):
return {
"name": "csiro",
"title": "CSIRO",
"description": "Harvests datasets from CSIRO DAP",
}

def gather_stage(self, harvest_job):
source_url: str = harvest_job.source.url.strip("/")
self._set_config(harvest_job.source.config)
log.info(f"{self.SRC_ID}: gather stage started: {source_url}")

object_ids = []
try:
for record in self._search_datasets(source_url):
identifier = munge_name(
"_".join(
[record["id"]["identifierType"], record["id"]["identifier"]]
)
)
log.info(
"%s: Creating HARVEST object for %s",
self.SRC_ID,
identifier,
)

guid = uuid.uuid5(uuid.NAMESPACE_DNS, identifier)
obj = HarvestObject(
guid=identifier, job=harvest_job, content=json.dumps(record)
)
obj.save()
object_ids.append(obj.id)

except SearchError:
log.exception("%s: search for datasets failed", self.SRC_ID)
self._save_gather_error(
f"{self.SRC_ID}: unable to search the remote portal for datasets: {source_url}",
harvest_job,
)

if not object_ids:
log.error("%s: search returns empty result.", self.SRC_ID)
self._save_gather_error(
f"{self.SRC_ID}: no datasets found at ODS remote portal: {source_url}",
harvest_job,
)

return object_ids

def _search_datasets(self, url: str) -> Iterable[dict[str, Any]]:
next_url = url + "/collections.json?rpp=100"
while True:
if not (resp := self._make_request(next_url)):
break

data = resp.json()
yield from data["dataCollections"]

if next_info := data["next"]:
next_url = next_info["href"]
else:
break

def fetch_stage(self, harvest_object):
self._set_config(harvest_object.source.config)
source_url = self._get_src_url(harvest_object)
package_dict = json.loads(harvest_object.content)
url = f"{source_url}/collections/{package_dict['id']['identifier']}.json"
log.debug("Fetch %s", url)

if not (resp := self._make_request(url)):
return False
metadata = resp.json()

if "data" in metadata:
with contextlib.suppress(tk.ValidationError):
if resp := self._make_request(metadata["data"] + ".json"):
data = resp.json()
metadata["files"] = data["file"]

harvest_object.content = json.dumps(metadata)
return True

def import_stage(self, harvest_object):
self.base_context = {
"model": model,
"session": model.Session,
"user": self._get_user_name(),
}

self._set_config(harvest_object.source.config)

package_dict = {}

data = json.loads(harvest_object.content)

log.debug("Import %s", data["id"])

package_dict["name"] = munge_name(
"_".join([data["id"]["identifierType"], data["id"]["identifier"]])
)
package_dict["id"] = str(uuid.uuid5(uuid.NAMESPACE_DNS, package_dict["name"]))

package_dict["title"] = data.pop("title")
package_dict["notes"] = data.pop("description")
package_dict["tags_string"] = [
{"name": munge_tag(tag)} for tag in data["keywords"].split(";")
]

package_dict["author"] = data.pop("leadResearcher")

package_dict["resources"] = [
{
"name": item["filename"],
"size": item["fileSize"],
"url": item["link"]["href"],
"format": item["link"]["mediaType"],
"extras": [{"key": k, "value": v} for k, v in item.items()],
}
for item in data.pop("files", [])
]

package_dict["extras"] = [
{"key": key, "value": value} for key, value in data.items()
]
harvest_object.content = json.dumps(package_dict)
super().import_stage(harvest_object)
6 changes: 1 addition & 5 deletions ckanext/harvest_basket/harvesters/csw.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,11 +99,7 @@ def get_package_dict(self, iso_values, harvest_object):
self.base_context = {"user": self._get_user_name()}
self._set_config(harvest_object.source.config)

schema = self.config.get("tsm_schema")
if not schema and (schema_name := self.config.get("tsm_named_schema")):
schema = get_schema(schema_name)

self.transmute_data(package_dict, schema)
self._transmute_content(package_dict)
return package_dict

def info(self):
Expand Down
7 changes: 1 addition & 6 deletions ckanext/harvest_basket/harvesters/dcat.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,5 @@ def modify_package_dict(self, package_dict, dcat_dict, harvest_object):
package_dict = json.loads(harvest_object.content)
self._set_config(harvest_object.source.config)

schema = self.config.get("tsm_schema")
if not schema and (schema_name := self.config.get("tsm_named_schema")):
schema = get_schema(schema_name)

self.transmute_data(package_dict, schema)

self._transmute_content(package_dict)
return package_dict
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
ods_harvester=ckanext.harvest_basket.harvesters:ODSHarvester
basket_dcat_json_harvester=ckanext.harvest_basket.harvesters:BasketDcatJsonHarvester
basket_csw_harvester=ckanext.harvest_basket.harvesters:BasketCswHarvester
basket_csiro_harvester=ckanext.harvest_basket.harvesters:CsiroHarvester
[babel.extractors]
ckan = ckan.lib.extract:extract_ckan
Expand Down

0 comments on commit 5b12edc

Please sign in to comment.