From 6d30c3661748cec732182720e2be57b42b132318 Mon Sep 17 00:00:00 2001 From: James B Date: Tue, 28 Nov 2023 12:08:43 +0000 Subject: [PATCH] Bug Fix: Correct JSON attached to correct Activity when Activity is in more than 1 document https://github.com/IATI/refresher/issues/300 Also resorted and regrouped a tiny bit of code so it's easier to read --- src/library/lakify.py | 10 +++++++--- src/library/solrize.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/src/library/lakify.py b/src/library/lakify.py index 95c9bee..65651c1 100644 --- a/src/library/lakify.py +++ b/src/library/lakify.py @@ -91,14 +91,18 @@ def process_hash_list(document_datasets): if identifiers: id_hash = utils.get_hash_for_identifier( clean_identifier(identifiers[0])) + + # XML activity_xml = etree.tostring(activity, encoding='utf-8') - activity_json = recursive_json_nest(activity, {}) act_blob_client = blob_service_client.get_blob_client( - container=config['ACTIVITIES_LAKE_CONTAINER_NAME'], blob='{}.xml'.format(id_hash)) + container=config['ACTIVITIES_LAKE_CONTAINER_NAME'], blob='{}/{}.xml'.format(doc_id, id_hash)) act_blob_client.upload_blob(activity_xml, overwrite=True) act_blob_client.set_blob_tags({"dataset_hash": file_hash}) + + # JSON + activity_json = recursive_json_nest(activity, {}) act_blob_json_client = blob_service_client.get_blob_client( - container=config['ACTIVITIES_LAKE_CONTAINER_NAME'], blob='{}.json'.format(id_hash)) + container=config['ACTIVITIES_LAKE_CONTAINER_NAME'], blob='{}/{}.json'.format(doc_id, id_hash)) act_blob_json_client.upload_blob( json.dumps(activity_json, ensure_ascii=False).replace( '{http://www.w3.org/XML/1998/namespace}', 'xml:').encode('utf-8'), diff --git a/src/library/solrize.py b/src/library/solrize.py index 35ec8eb..caa2d42 100644 --- a/src/library/solrize.py +++ b/src/library/solrize.py @@ -143,7 +143,7 @@ def process_hash_list(document_datasets): for fa in flattened_activities[0]: hashed_identifier = utils.get_hash_for_identifier( fa['iati_identifier']) - blob_name = '{}.xml'.format(hashed_identifier) + blob_name = '{}/{}.xml'.format(file_id, hashed_identifier) try: blob_client = blob_service_client.get_blob_client( @@ -165,7 +165,7 @@ def process_hash_list(document_datasets): raise SolrizeSourceError('Could not identify charset for blob: ' + blob_name + ', file hash: ' + file_hash + ', iati-identifier: ' + fa['iati_identifier']) - json_blob_name = '{}.json'.format(hashed_identifier) + json_blob_name = '{}/{}.json'.format(file_id, hashed_identifier) try: json_blob_client = blob_service_client.get_blob_client(