From 1394cecfd324dea5467dbaaa54f8dcd9f5367e5e Mon Sep 17 00:00:00 2001 From: Biel Stela Date: Fri, 17 Jan 2025 14:16:30 +0100 Subject: [PATCH] Fixes h3 tables having partial/missing regions. With the latest dataset updates the nodata handling changed, meaning that the results of raster to h3 have different cardinalities. The join assumed all dataframes had equal indexes. It failed by keeping only the indexes of the first dataframe in the list. --- data/h3_data_importer/delete_h3_tables.py | 8 +++++++- data/h3_data_importer/raster_folder_to_h3_table.py | 6 ++---- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/data/h3_data_importer/delete_h3_tables.py b/data/h3_data_importer/delete_h3_tables.py index c72f9adfb..71ee97908 100644 --- a/data/h3_data_importer/delete_h3_tables.py +++ b/data/h3_data_importer/delete_h3_tables.py @@ -1,3 +1,8 @@ +"""Script to delete dangling h3 tables that are no longer used. + +The delete criteria is if the table is referenced anywhere else. +""" + import logging import click @@ -13,6 +18,7 @@ @click.option("--drop-contextuals", is_flag=True) @click.option("--dry-run", is_flag=True) def main(drop_contextuals: bool, dry_run: bool): + """Delete dangling h3 tables that are no longer used""" with psycopg.connect(get_connection_info()) as conn: with conn.cursor() as cursor: # find all the tables that start with h3_grid* @@ -51,7 +57,7 @@ def main(drop_contextuals: bool, dry_run: bool): """DELETE FROM contextual_layer WHERE id = ANY(%s); """, - (list(ctx[0] for ctx in contextuals_to_drop),), + ([ctx[0] for ctx in contextuals_to_drop],), ) log.info(f"Deleted contextual layers {', '.join(str(ctx[0]) for ctx in contextuals_to_drop)}") else: diff --git a/data/h3_data_importer/raster_folder_to_h3_table.py b/data/h3_data_importer/raster_folder_to_h3_table.py index fade65420..5b0b491ed 100644 --- a/data/h3_data_importer/raster_folder_to_h3_table.py +++ b/data/h3_data_importer/raster_folder_to_h3_table.py @@ -64,7 +64,6 @@ def raster_to_h3(reference_raster: 
Path, h3_resolution: int, raster_file: Path) with rio.open(reference_raster) as ref: check_srs(ref, raster) check_transform(ref, raster) - h3 = h3ronpy.raster.raster_to_dataframe( raster.read(1), transform=raster.transform, @@ -261,12 +260,11 @@ def main(folder: Path, table: str, data_type: str, dataset: str, year: int, h3_r with multiprocessing.Pool(thread_count) as pool: h3s = pool.map(partial_raster_to_h3, raster_files) log.info(f"Joining H3 data of each raster into single dataframe for table {table}") - df = h3s[0] + df: pd.DataFrame = h3s[0] with click.progressbar(h3s[1:], label="Joining H3 dataframes") as pbar: for h3df in pbar: - df = df.join(h3df) + df = df.join(h3df, how="outer") del h3df - # Part 2: Ingest h3 index into the database to_the_db(df, table, data_type, dataset, year, h3_res)