From 1758b8c7017ab8eafc7c567b17547db9dc23fa30 Mon Sep 17 00:00:00 2001 From: mjanez <96422458+mjanez@users.noreply.github.com> Date: Tue, 19 Nov 2024 11:56:12 +0100 Subject: [PATCH] Improve dataset handling in harvesters by adding checks for dataset existence and logging warnings for missing datasets --- ckanext/schemingdcat/harvesters/sql/base.py | 62 +++++++++++++-------- ckanext/schemingdcat/harvesters/xls.py | 54 +++++++++++------- 2 files changed, 72 insertions(+), 44 deletions(-) diff --git a/ckanext/schemingdcat/harvesters/sql/base.py b/ckanext/schemingdcat/harvesters/sql/base.py index 063c648b..da390114 100644 --- a/ckanext/schemingdcat/harvesters/sql/base.py +++ b/ckanext/schemingdcat/harvesters/sql/base.py @@ -426,40 +426,54 @@ def gather_stage(self, harvest_job): log.debug(f'new ({len(new)})') log.debug(f'delete ({len(delete)})') log.debug(f'change ({len(change)})') - + ids = [] for guid in new: - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(datasets_to_harvest.get(guid)), - extras=[HarvestObjectExtra(key='status', value='new')]) - obj.save() - ids.append({'id': obj.id, 'name': datasets_to_harvest.get(guid)['name'], 'identifier': datasets_to_harvest.get(guid)['identifier']}) + dataset = datasets_to_harvest.get(guid) + if dataset: + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset), + extras=[HarvestObjectExtra(key='status', value='new')]) + obj.save() + ids.append({'id': obj.id, 'name': dataset['name'], 'identifier': dataset['identifier']}) + else: + log.warning(f'Dataset for GUID {guid} not found in datasets_to_harvest') + for guid in change: - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(datasets_to_harvest.get(guid)), - package_id=guid_to_package_id[guid], - extras=[HarvestObjectExtra(key='status', value='change')]) - obj.save() - ids.append({'id': obj.id, 'name': datasets_to_harvest.get(guid)['name'], 'identifier': datasets_to_harvest.get(guid)['identifier']}) + dataset = datasets_to_harvest.get(guid) + if dataset: + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset), + package_id=guid_to_package_id[guid], + extras=[HarvestObjectExtra(key='status', value='change')]) + obj.save() + ids.append({'id': obj.id, 'name': dataset['name'], 'identifier': dataset['identifier']}) + else: + log.warning(f'Dataset for GUID {guid} not found in datasets_to_harvest') + for guid in delete: - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(datasets_to_harvest.get(guid)), - package_id=guid_to_package_id[guid], - extras=[HarvestObjectExtra(key='status', value='delete')]) - model.Session.query(HarvestObject).\ - filter_by(guid=guid).\ - update({'current': False}, False) - obj.save() - ids.append({'id': obj.id, 'name': datasets_to_harvest.get(guid)['name'], 'identifier': datasets_to_harvest.get(guid)['identifier']}) - + dataset = datasets_to_harvest.get(guid) + if dataset: + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset), + package_id=guid_to_package_id[guid], + extras=[HarvestObjectExtra(key='status', value='delete')]) + model.Session.query(HarvestObject).\ + filter_by(guid=guid).\ + update({'current': False}, False) + obj.save() + ids.append({'id': obj.id, 'name': dataset['name'], 'identifier': dataset['identifier']}) + else: + log.warning(f'Dataset for GUID {guid} not found in datasets_to_harvest') + log.debug('Number of elements in clean_datasets: %s and object_ids: %s', len(clean_datasets), len(ids)) - - #DEBUG::Log clean_datasets/ ids + + # Log clean_datasets/ ids #self._log_export_clean_datasets_and_ids(harvest_source_title, clean_datasets, ids) return [id_dict['id'] for id_dict in ids] - + def fetch_stage(self, harvest_object): + # Nothing to do here - we got the package dict in the search in the gather stage return True - - #TODO: implementar el import stage + def import_stage(self, harvest_object): """ Performs the import stage of the SchemingDCATXLSHarvester. diff --git a/ckanext/schemingdcat/harvesters/xls.py b/ckanext/schemingdcat/harvesters/xls.py index 53fcafc1..b879cc35 100644 --- a/ckanext/schemingdcat/harvesters/xls.py +++ b/ckanext/schemingdcat/harvesters/xls.py @@ -931,31 +931,45 @@ def gather_stage(self, harvest_job): log.debug(f'new ({len(new)})') log.debug(f'delete ({len(delete)})') log.debug(f'change ({len(change)})') - + ids = [] for guid in new: - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(datasets_to_harvest.get(guid)), - extras=[HarvestObjectExtra(key='status', value='new')]) - obj.save() - ids.append({'id': obj.id, 'name': datasets_to_harvest.get(guid)['name'], 'identifier': datasets_to_harvest.get(guid)['identifier']}) + dataset = datasets_to_harvest.get(guid) + if dataset: + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset), + extras=[HarvestObjectExtra(key='status', value='new')]) + obj.save() + ids.append({'id': obj.id, 'name': dataset['name'], 'identifier': dataset['identifier']}) + else: + log.warning(f'Dataset for GUID {guid} not found in datasets_to_harvest') + for guid in change: - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(datasets_to_harvest.get(guid)), - package_id=guid_to_package_id[guid], - extras=[HarvestObjectExtra(key='status', value='change')]) - obj.save() - ids.append({'id': obj.id, 'name': datasets_to_harvest.get(guid)['name'], 'identifier': datasets_to_harvest.get(guid)['identifier']}) + dataset = datasets_to_harvest.get(guid) + if dataset: + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset), + package_id=guid_to_package_id[guid], + extras=[HarvestObjectExtra(key='status', value='change')]) + obj.save() + ids.append({'id': obj.id, 'name': dataset['name'], 'identifier': dataset['identifier']}) + else: + log.warning(f'Dataset for GUID {guid} not found in datasets_to_harvest') + for guid in delete: - obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(datasets_to_harvest.get(guid)), - package_id=guid_to_package_id[guid], - extras=[HarvestObjectExtra(key='status', value='delete')]) - model.Session.query(HarvestObject).\ - filter_by(guid=guid).\ - update({'current': False}, False) - obj.save() - ids.append({'id': obj.id, 'name': datasets_to_harvest.get(guid)['name'], 'identifier': datasets_to_harvest.get(guid)['identifier']}) - + dataset = datasets_to_harvest.get(guid) + if dataset: + obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset), + package_id=guid_to_package_id[guid], + extras=[HarvestObjectExtra(key='status', value='delete')]) + model.Session.query(HarvestObject).\ + filter_by(guid=guid).\ + update({'current': False}, False) + obj.save() + ids.append({'id': obj.id, 'name': dataset['name'], 'identifier': dataset['identifier']}) + else: + log.warning(f'Dataset for GUID {guid} not found in datasets_to_harvest') + log.debug('Number of elements in clean_datasets: %s and object_ids: %s', len(clean_datasets), len(ids)) - + # Log clean_datasets/ ids #self._log_export_clean_datasets_and_ids(harvest_source_title, clean_datasets, ids)