From af8dbfc8b9a0902677c639047488d8cfcc7bcc41 Mon Sep 17 00:00:00 2001 From: Anthony Lukach Date: Tue, 3 Sep 2024 21:47:19 -0700 Subject: [PATCH] Expand pre-commit format logic --- .pre-commit-config.yaml | 12 +- data/config/fill_location.py | 14 +- .../zonal_pop_NTL_VIIRS_EOG.py | 66 +- notebooks/MP_SCRIPTS/zonal_fathom.py | 121 ++-- .../MP_SCRIPTS/zonal_pop_NTL_VIIRS_LEN.py | 66 +- notebooks/MP_SCRIPTS/zonal_pop_by_gender.py | 83 +-- notebooks/MP_SCRIPTS/zonal_urbanization.py | 80 +-- postgres/chunk_parquet.py | 10 +- postgres/nyc_sample.py | 29 +- space2stats_api/cdk/app.py | 3 +- space2stats_api/cdk/aws_stack.py | 26 +- space2stats_api/cdk/settings.py | 1 + space2stats_api/src/space2stats/__main__.py | 6 +- space2stats_api/src/space2stats/app.py | 9 +- space2stats_api/src/space2stats/handler.py | 2 +- space2stats_api/src/space2stats/main.py | 12 +- space2stats_api/src/tests/test_api.py | 19 +- space2stats_api/src/tests/test_h3_utils.py | 1 - src/country_zonal.py | 565 +++++++++++------- src/global_zonal.py | 332 ++++++---- src/h3_helper.py | 353 +++++++---- src/space2stats_data_config.py | 18 +- 22 files changed, 1124 insertions(+), 704 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index d8ad7b7..2b6fdaf 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,13 +1,21 @@ repos: + - repo: https://github.com/PyCQA/isort + rev: 5.13.2 + hooks: + - id: isort + language_version: python + args: ["-m", "3", "--trailing-comma", "-l", "88"] + - repo: https://github.com/charliermarsh/ruff-pre-commit - rev: v0.5.5 + rev: v0.5.5 hooks: - id: ruff args: [--fix] files: ^space2stats_api/ + - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.11.1 + rev: v1.11.1 hooks: - id: mypy args: [--ignore-missing-imports] diff --git a/data/config/fill_location.py b/data/config/fill_location.py index 588b14d..b0e14c5 100644 --- a/data/config/fill_location.py +++ b/data/config/fill_location.py @@ -1,21 +1,21 @@ import json # Load data from iso3.json -with open('iso3.json', 'r') as f: +with open("iso3.json", "r") as f: iso3_data = json.load(f) # Load the template -with open('location_template.json', 'r') as f: +with open("location_template.json", "r") as f: location_template = json.load(f) # Create a filled configuration location_filled = {"locations": []} for iso3, country_name in iso3_data.items(): - location = location_template['locations'][0].copy() - location['ISO3'] = iso3 - location['country_name'] = country_name - location_filled['locations'].append(location) + location = location_template["locations"][0].copy() + location["ISO3"] = iso3 + location["country_name"] = country_name + location_filled["locations"].append(location) # Save the filled configuration -with open('location_filled.json', 'w') as f: +with open("location_filled.json", "w") as f: json.dump(location_filled, f, indent=2) diff --git a/notebooks/IMPLEMENTATIONS/ZON_MNACE_Compile_NTL/zonal_pop_NTL_VIIRS_EOG.py b/notebooks/IMPLEMENTATIONS/ZON_MNACE_Compile_NTL/zonal_pop_NTL_VIIRS_EOG.py index 4667dd3..0bda932 100644 --- a/notebooks/IMPLEMENTATIONS/ZON_MNACE_Compile_NTL/zonal_pop_NTL_VIIRS_EOG.py +++ b/notebooks/IMPLEMENTATIONS/ZON_MNACE_Compile_NTL/zonal_pop_NTL_VIIRS_EOG.py @@ -1,69 +1,79 @@ -import sys, os, multiprocessing +import multiprocessing +import os +import sys -import pandas as pd import geopandas as gpd -#import numpy as np - -from h3 import h3 - -import GOSTrocks.rasterMisc as rMisc import GOSTrocks.ntlMisc as ntl +import GOSTrocks.rasterMisc as rMisc +import 
pandas as pd from GOSTrocks.misc import tPrint +from h3 import h3 + +# import numpy as np + sys.path.append("../../src") import h3_helper -AWS_S3_BUCKET = 'wbg-geography01' +AWS_S3_BUCKET = "wbg-geography01" AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN") + def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): - cName = f'{os.path.basename(os.path.dirname(out_file))}-{os.path.basename(cur_raster_file)}' + cName = f"{os.path.basename(os.path.dirname(out_file))}-{os.path.basename(cur_raster_file)}" if verbose: - tPrint(f'Starting {cName}') + tPrint(f"Starting {cName}") if buffer0: - gdf['geometry'] = gdf['geometry'].buffer(0) + gdf["geometry"] = gdf["geometry"].buffer(0) res = rMisc.zonalStats(gdf, cur_raster_file, minVal=0, verbose=False) - res = pd.DataFrame(res, columns=['SUM', 'MIN', 'MAX', 'MEAN']) - res['id'] = gdf['id'].values + res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) + res["id"] = gdf["id"].values if verbose: - tPrint(f'**** finished {cName}') - return({out_file:res}) + tPrint(f"**** finished {cName}") + return {out_file: res} + if __name__ == "__main__": - multiprocess=True + multiprocess = True verbose = True tPrint("Starting") h3_level = 6 data_prefix = "VIIRS_ANNUAL_EOG" - + # Get list of nighttime lights VIIRS data # ntl_files = ntl.aws_search_ntl() ntl_folder = "/home/public/Data/GLOBAL/NighttimeLights/VIIRS_ANNUAL_EOG_V21" - ntl_files = [os.path.join(ntl_folder, x) for x in os.listdir(ntl_folder) if x.endswith(".tif")] - + ntl_files = [ + os.path.join(ntl_folder, x) + for x in os.listdir(ntl_folder) + if x.endswith(".tif") + ] + # h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False) admin_bounds = "/home/wb411133/data/Global/ADMIN/Admin2_Polys.shp" # Generate a list from the global admin boundaries inA = gpd.read_file(admin_bounds) - inA['id'] = list(inA.index) + inA["id"] = list(inA.index) h3_0_list = {} for region, countries in inA.groupby("WB_REGION"): h3_0_list[region] = countries - + if verbose: tPrint("H3_0 list generated") # set up mp arguments for h3_0_key, cur_gdf in h3_0_list.items(): arg_list = [] - processed_list = [] + processed_list = [] for pop_file in ntl_files: filename = os.path.basename(f'{pop_file.replace(".tif", "")}_zonal.csv') - out_s3_key = f'Space2Stats/h3_stats_data/GLOBAL/{data_prefix}/{h3_0_key}/{filename}' - out_s3_key = f'Space2Stats/h3_stats_data/ADM_GLOBAL/{data_prefix}/{h3_0_key}/{filename}' - full_path = os.path.join("s3://", AWS_S3_BUCKET, out_s3_key) + out_s3_key = ( + f"Space2Stats/h3_stats_data/GLOBAL/{data_prefix}/{h3_0_key}/{filename}" + ) + out_s3_key = f"Space2Stats/h3_stats_data/ADM_GLOBAL/{data_prefix}/{h3_0_key}/{filename}" + full_path = os.path.join("s3://", AWS_S3_BUCKET, out_s3_key) try: tempPD = pd.read_csv(full_path) processed_list.append(filename) @@ -71,8 +81,8 @@ def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): arg_list.append([cur_gdf, pop_file, out_s3_key, True, verbose]) if multiprocess: - with multiprocessing.Pool(processes=min([70,len(ntl_files)])) as pool: - results = pool.starmap(run_zonal, arg_list) + with multiprocessing.Pool(processes=min([70, len(ntl_files)])) as pool: + results = pool.starmap(run_zonal, arg_list) else: for a in arg_list: results = run_zonal(*a) @@ -88,4 +98,4 @@ def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): "secret": AWS_SECRET_ACCESS_KEY, "token": 
AWS_SESSION_TOKEN, }, - ) \ No newline at end of file + ) diff --git a/notebooks/MP_SCRIPTS/zonal_fathom.py b/notebooks/MP_SCRIPTS/zonal_fathom.py index c174280..2d74ab8 100644 --- a/notebooks/MP_SCRIPTS/zonal_fathom.py +++ b/notebooks/MP_SCRIPTS/zonal_fathom.py @@ -1,22 +1,25 @@ -import sys, os, multiprocessing +import multiprocessing +import os +import sys import pandas as pd -#import geopandas as gpd -#import numpy as np - from h3 import h3 +# import geopandas as gpd +# import numpy as np + + sys.path.insert(0, "/home/wb411133/Code/GOSTrocks/src") -import GOSTrocks.rasterMisc as rMisc -import GOSTrocks.ntlMisc as ntl import GOSTrocks.dataMisc as dMisc +import GOSTrocks.ntlMisc as ntl +import GOSTrocks.rasterMisc as rMisc from GOSTrocks.misc import tPrint sys.path.append("../../src") -import h3_helper import global_zonal +import h3_helper -AWS_S3_BUCKET = 'wbg-geography01' +AWS_S3_BUCKET = "wbg-geography01" AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN") @@ -28,27 +31,35 @@ h3_level = 6 data_prefix_flood = "Flood" data_prefix_pop = "Flood_Pop" - flood_reclass_dict = { 0: [-9999, 0], - 1: [0, 10], - 2: [10.1, 50], - 3: [50, 100000.0],} - + flood_reclass_dict = { + 0: [-9999, 0], + 1: [0, 10], + 2: [10.1, 50], + 3: [50, 100000.0], + } + # Define input layers pop_layer = r"/home/public/Data/GLOBAL/Population/WorldPop_PPP_2020ppp_2020_1km_Aggregated.tif" # Select layer to downlaod - flood_type = ["PLUVIAL","FLUVIAL","COASTAL"] + flood_type = ["PLUVIAL", "FLUVIAL", "COASTAL"] defence = ["DEFENDED"] - return_period = ['1in100'] + return_period = ["1in100"] climate_model = ["PERCENTILE50"] year = ["2020"] all_vrts = dMisc.get_fathom_vrts(True) - sel_images = all_vrts.loc[(all_vrts['FLOOD_TYPE'].isin(flood_type)) & (all_vrts['DEFENCE'].isin(defence)) & - (all_vrts['RETURN'].isin(return_period)) & (all_vrts['CLIMATE_MODEL'].isin(climate_model))] - fathom_vrt_path = sel_images['PATH'].iloc[0] + sel_images = all_vrts.loc[ + (all_vrts["FLOOD_TYPE"].isin(flood_type)) + & (all_vrts["DEFENCE"].isin(defence)) + & (all_vrts["RETURN"].isin(return_period)) + & (all_vrts["CLIMATE_MODEL"].isin(climate_model)) + ] + fathom_vrt_path = sel_images["PATH"].iloc[0] # h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False, read_pickle=True) - h3_1_list = h3_helper.generate_lvl1_lists(h3_level, return_gdf=True, buffer0=False, read_pickle=True) + h3_1_list = h3_helper.generate_lvl1_lists( + h3_level, return_gdf=True, buffer0=False, read_pickle=True + ) if verbose: tPrint("H3_1 list generated") # set up arguments for zonal processing @@ -56,30 +67,62 @@ flood_pop_args = [] for h3_1_key, cur_gdf in h3_1_list.items(): for fathom_index, fathom_row in sel_images.iterrows(): - fathom_path = fathom_row['PATH'] - fathom_file = "_".join([fathom_row['FLOOD_TYPE'], fathom_row['RETURN'], fathom_row['CLIMATE_MODEL'], fathom_row['YEAR']]) - - flood_pop_filename = f'FATHOM_total_pop_{fathom_file}.csv' - pop_out_s3_key = f'Space2Stats/h3_stats_data/GLOBAL/{data_prefix_pop}/{h3_1_key}/{flood_pop_filename}' + fathom_path = fathom_row["PATH"] + fathom_file = "_".join( + [ + fathom_row["FLOOD_TYPE"], + fathom_row["RETURN"], + fathom_row["CLIMATE_MODEL"], + fathom_row["YEAR"], + ] + ) + + flood_pop_filename = f"FATHOM_total_pop_{fathom_file}.csv" + pop_out_s3_key = f"Space2Stats/h3_stats_data/GLOBAL/{data_prefix_pop}/{h3_1_key}/{flood_pop_filename}" full_path_pop = os.path.join("s3://", 
AWS_S3_BUCKET, pop_out_s3_key) try: - tempPD = pd.read_csv(full_path_pop) + tempPD = pd.read_csv(full_path_pop) except: - flood_pop_args.append([cur_gdf, "shape_id", pop_layer, fathom_path, pop_out_s3_key, - None, flood_reclass_dict, - True, 0, 10000000, verbose]) - total_flood_filename = f'FATHOM_total_depth_{fathom_file}.csv' - depth_out_s3_key = f'Space2Stats/h3_stats_data/GLOBAL/{data_prefix_pop}/{h3_1_key}/{total_flood_filename}' + flood_pop_args.append( + [ + cur_gdf, + "shape_id", + pop_layer, + fathom_path, + pop_out_s3_key, + None, + flood_reclass_dict, + True, + 0, + 10000000, + verbose, + ] + ) + total_flood_filename = f"FATHOM_total_depth_{fathom_file}.csv" + depth_out_s3_key = f"Space2Stats/h3_stats_data/GLOBAL/{data_prefix_pop}/{h3_1_key}/{total_flood_filename}" full_path_depth = os.path.join("s3://", AWS_S3_BUCKET, depth_out_s3_key) try: - tempPD = pd.read_csv(full_path_depth) + tempPD = pd.read_csv(full_path_depth) except: - flood_depth_args.append([cur_gdf, "shape_id", fathom_path, depth_out_s3_key, True, 0, 1000, verbose]) + flood_depth_args.append( + [ + cur_gdf, + "shape_id", + fathom_path, + depth_out_s3_key, + True, + 0, + 1000, + verbose, + ] + ) tPrint("Arguments generated") # Multiprocess flood population results if multiprocess: - with multiprocessing.Pool(multiprocessing.cpu_count()-2) as pool: - pop_results = pool.starmap(global_zonal.zonal_stats_categorical, flood_pop_args) + with multiprocessing.Pool(multiprocessing.cpu_count() - 2) as pool: + pop_results = pool.starmap( + global_zonal.zonal_stats_categorical, flood_pop_args + ) else: pop_results = [] for a in flood_pop_args: @@ -101,8 +144,12 @@ # Multiprocess flood depth results if multiprocess: - with multiprocessing.Pool(processes=min([multiprocessing.cpu_count()-2,len(arg_list)])) as pool: - depth_results = pool.starmap(global_zonal.zonal_stats_numerical, flood_depth_args) + with multiprocessing.Pool( + processes=min([multiprocessing.cpu_count() - 2, len(arg_list)]) + ) as pool: + depth_results = pool.starmap( + global_zonal.zonal_stats_numerical, flood_depth_args + ) else: depth_results = [] for a in flood_depth_args: @@ -120,4 +167,4 @@ "secret": AWS_SECRET_ACCESS_KEY, "token": AWS_SESSION_TOKEN, }, - ) \ No newline at end of file + ) diff --git a/notebooks/MP_SCRIPTS/zonal_pop_NTL_VIIRS_LEN.py b/notebooks/MP_SCRIPTS/zonal_pop_NTL_VIIRS_LEN.py index 4667dd3..0bda932 100755 --- a/notebooks/MP_SCRIPTS/zonal_pop_NTL_VIIRS_LEN.py +++ b/notebooks/MP_SCRIPTS/zonal_pop_NTL_VIIRS_LEN.py @@ -1,69 +1,79 @@ -import sys, os, multiprocessing +import multiprocessing +import os +import sys -import pandas as pd import geopandas as gpd -#import numpy as np - -from h3 import h3 - -import GOSTrocks.rasterMisc as rMisc import GOSTrocks.ntlMisc as ntl +import GOSTrocks.rasterMisc as rMisc +import pandas as pd from GOSTrocks.misc import tPrint +from h3 import h3 + +# import numpy as np + sys.path.append("../../src") import h3_helper -AWS_S3_BUCKET = 'wbg-geography01' +AWS_S3_BUCKET = "wbg-geography01" AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN") + def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): - cName = f'{os.path.basename(os.path.dirname(out_file))}-{os.path.basename(cur_raster_file)}' + cName = f"{os.path.basename(os.path.dirname(out_file))}-{os.path.basename(cur_raster_file)}" if verbose: - tPrint(f'Starting {cName}') + tPrint(f"Starting {cName}") if buffer0: - gdf['geometry'] = 
gdf['geometry'].buffer(0) + gdf["geometry"] = gdf["geometry"].buffer(0) res = rMisc.zonalStats(gdf, cur_raster_file, minVal=0, verbose=False) - res = pd.DataFrame(res, columns=['SUM', 'MIN', 'MAX', 'MEAN']) - res['id'] = gdf['id'].values + res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) + res["id"] = gdf["id"].values if verbose: - tPrint(f'**** finished {cName}') - return({out_file:res}) + tPrint(f"**** finished {cName}") + return {out_file: res} + if __name__ == "__main__": - multiprocess=True + multiprocess = True verbose = True tPrint("Starting") h3_level = 6 data_prefix = "VIIRS_ANNUAL_EOG" - + # Get list of nighttime lights VIIRS data # ntl_files = ntl.aws_search_ntl() ntl_folder = "/home/public/Data/GLOBAL/NighttimeLights/VIIRS_ANNUAL_EOG_V21" - ntl_files = [os.path.join(ntl_folder, x) for x in os.listdir(ntl_folder) if x.endswith(".tif")] - + ntl_files = [ + os.path.join(ntl_folder, x) + for x in os.listdir(ntl_folder) + if x.endswith(".tif") + ] + # h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False) admin_bounds = "/home/wb411133/data/Global/ADMIN/Admin2_Polys.shp" # Generate a list from the global admin boundaries inA = gpd.read_file(admin_bounds) - inA['id'] = list(inA.index) + inA["id"] = list(inA.index) h3_0_list = {} for region, countries in inA.groupby("WB_REGION"): h3_0_list[region] = countries - + if verbose: tPrint("H3_0 list generated") # set up mp arguments for h3_0_key, cur_gdf in h3_0_list.items(): arg_list = [] - processed_list = [] + processed_list = [] for pop_file in ntl_files: filename = os.path.basename(f'{pop_file.replace(".tif", "")}_zonal.csv') - out_s3_key = f'Space2Stats/h3_stats_data/GLOBAL/{data_prefix}/{h3_0_key}/{filename}' - out_s3_key = f'Space2Stats/h3_stats_data/ADM_GLOBAL/{data_prefix}/{h3_0_key}/{filename}' - full_path = os.path.join("s3://", AWS_S3_BUCKET, out_s3_key) + out_s3_key = ( + f"Space2Stats/h3_stats_data/GLOBAL/{data_prefix}/{h3_0_key}/{filename}" + ) + out_s3_key = f"Space2Stats/h3_stats_data/ADM_GLOBAL/{data_prefix}/{h3_0_key}/{filename}" + full_path = os.path.join("s3://", AWS_S3_BUCKET, out_s3_key) try: tempPD = pd.read_csv(full_path) processed_list.append(filename) @@ -71,8 +81,8 @@ def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): arg_list.append([cur_gdf, pop_file, out_s3_key, True, verbose]) if multiprocess: - with multiprocessing.Pool(processes=min([70,len(ntl_files)])) as pool: - results = pool.starmap(run_zonal, arg_list) + with multiprocessing.Pool(processes=min([70, len(ntl_files)])) as pool: + results = pool.starmap(run_zonal, arg_list) else: for a in arg_list: results = run_zonal(*a) @@ -88,4 +98,4 @@ def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): "secret": AWS_SECRET_ACCESS_KEY, "token": AWS_SESSION_TOKEN, }, - ) \ No newline at end of file + ) diff --git a/notebooks/MP_SCRIPTS/zonal_pop_by_gender.py b/notebooks/MP_SCRIPTS/zonal_pop_by_gender.py index aa77aef..e48f1e6 100755 --- a/notebooks/MP_SCRIPTS/zonal_pop_by_gender.py +++ b/notebooks/MP_SCRIPTS/zonal_pop_by_gender.py @@ -1,86 +1,97 @@ -import sys, os, importlib, math, multiprocessing -import rasterio, geojson +import importlib +import math +import multiprocessing +import os +import sys -import pandas as pd +import geojson import geopandas as gpd +import GOSTrocks.rasterMisc as rMisc import numpy as np - +import pandas as pd +import rasterio +from GOSTrocks.misc import tPrint from h3 import h3 -from tqdm import tqdm from shapely.geometry import Polygon - -import 
GOSTrocks.rasterMisc as rMisc -from GOSTrocks.misc import tPrint +from tqdm import tqdm sys.path.append("../../src") import h3_helper -AWS_S3_BUCKET = 'wbg-geography01' +AWS_S3_BUCKET = "wbg-geography01" AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN") + def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): - cName = f'{os.path.basename(os.path.dirname(out_file))}-{os.path.basename(cur_raster_file)}' + cName = f"{os.path.basename(os.path.dirname(out_file))}-{os.path.basename(cur_raster_file)}" if verbose: - tPrint(f'Starting {cName}') + tPrint(f"Starting {cName}") if buffer0: - gdf['geometry'] = gdf['geometry'].buffer(0) + gdf["geometry"] = gdf["geometry"].buffer(0) res = rMisc.zonalStats(gdf, cur_raster_file, minVal=0, verbose=False) - res = pd.DataFrame(res, columns=['SUM', 'MIN', 'MAX', 'MEAN']) - res['id'] = gdf['id'].values + res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) + res["id"] = gdf["id"].values if verbose: - tPrint(f'**** finished {cName}') - return({out_file:res}) + tPrint(f"**** finished {cName}") + return {out_file: res} + if __name__ == "__main__": - multiprocess=True + multiprocess = True verbose = True tPrint("Starting") h3_level = 6 data_prefix = "WorldPop_2020_Demographics" - + admin_bounds = "/home/wb411133/data/Global/ADMIN/Admin2_Polys.shp" - - ''' + + """ global_urban = "/home/public/Data/GLOBAL/GHSL/SMOD/GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V1_0.tif" - ''' + """ # Define input raster variables - population_folder = "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2020/GLOBAL_1km_Demographics" - pop_files = [os.path.join(population_folder, x) for x in os.listdir(population_folder) if x.endswith("1km.tif")] + population_folder = ( + "/home/public/Data/GLOBAL/Population/WorldPop_PPP_2020/GLOBAL_1km_Demographics" + ) + pop_files = [ + os.path.join(population_folder, x) + for x in os.listdir(population_folder) + if x.endswith("1km.tif") + ] # h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False) - + # Generate a list from the global admin boundaries inA = gpd.read_file(admin_bounds) - inA['id'] = list(inA.index) + inA["id"] = list(inA.index) h3_0_list = {} for region, countries in inA.groupby("WB_REGION"): h3_0_list[region] = countries - + if verbose: - tPrint("H3_0 list generated") - + tPrint("H3_0 list generated") + # set up mp arguments for h3_0_key, cur_gdf in h3_0_list.items(): arg_list = [] - processed_list = [] + processed_list = [] for pop_file in pop_files: filename = os.path.basename(f'{pop_file.replace(".tif", "")}_zonal.csv') # out_s3_key = f'Space2Stats/h3_stats_data/GLOBAL/{data_prefix}/{h3_0_key}/{filename}' - out_s3_key = f'Space2Stats/h3_stats_data/ADM_GLOBAL/{data_prefix}/{h3_0_key}/{filename}' - full_path = os.path.join("s3://", AWS_S3_BUCKET, out_s3_key) - ''' + out_s3_key = f"Space2Stats/h3_stats_data/ADM_GLOBAL/{data_prefix}/{h3_0_key}/{filename}" + full_path = os.path.join("s3://", AWS_S3_BUCKET, out_s3_key) + """ try: tempPD = pd.read_csv(full_path) processed_list.append(filename) except: - ''' + """ arg_list.append([cur_gdf, pop_file, out_s3_key, True, verbose]) if multiprocess: - with multiprocessing.Pool(processes=min([70,len(pop_files)])) as pool: - results = pool.starmap(run_zonal, arg_list) + with multiprocessing.Pool(processes=min([70, len(pop_files)])) as pool: + results = pool.starmap(run_zonal, arg_list) else: for a in arg_list: results = run_zonal(*a) @@ 
-96,4 +107,4 @@ def run_zonal(gdf, cur_raster_file, out_file, buffer0=False, verbose=False): "secret": AWS_SECRET_ACCESS_KEY, "token": AWS_SESSION_TOKEN, }, - ) \ No newline at end of file + ) diff --git a/notebooks/MP_SCRIPTS/zonal_urbanization.py b/notebooks/MP_SCRIPTS/zonal_urbanization.py index 67f5e32..9b5b014 100644 --- a/notebooks/MP_SCRIPTS/zonal_urbanization.py +++ b/notebooks/MP_SCRIPTS/zonal_urbanization.py @@ -1,73 +1,83 @@ -import sys, os, multiprocessing +import multiprocessing +import os +import sys +import GOSTrocks.ntlMisc as ntl +import GOSTrocks.rasterMisc as rMisc import pandas as pd -#import geopandas as gpd -#import numpy as np - +from GOSTrocks.misc import tPrint from h3 import h3 -import GOSTrocks.rasterMisc as rMisc -import GOSTrocks.ntlMisc as ntl -from GOSTrocks.misc import tPrint +# import geopandas as gpd +# import numpy as np + sys.path.append("../../src") -import h3_helper import global_zonal +import h3_helper -AWS_S3_BUCKET = 'wbg-geography01' +AWS_S3_BUCKET = "wbg-geography01" AWS_ACCESS_KEY_ID = os.getenv("AWS_ACCESS_KEY_ID") AWS_SECRET_ACCESS_KEY = os.getenv("AWS_SECRET_ACCESS_KEY") AWS_SESSION_TOKEN = os.getenv("AWS_SESSION_TOKEN") if __name__ == "__main__": - multiprocess=True + multiprocess = True verbose = True run_urban = True run_urban_pop = False - + tPrint("Starting") h3_level = 6 data_prefix = "Urbanization" data_prefix_pop = "Urbanization_Pop" - + # Urbanization layers - unq_urban = [11,12,13,21,22,23,30] + unq_urban = [11, 12, 13, 21, 22, 23, 30] ghsl_folder = "/home/public/Data/GLOBAL/GHSL/" - ghs_smod = os.path.join(ghsl_folder, "SMOD", "GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V1_0.tif") - ghs_pop = os.path.join(ghsl_folder, "POP", "GHS_POP_E2020_GLOBE_R2023A_54009_100_V1_0.tif") - - #h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False) - #if verbose: + ghs_smod = os.path.join( + ghsl_folder, "SMOD", "GHS_SMOD_E2020_GLOBE_R2023A_54009_1000_V1_0.tif" + ) + ghs_pop = os.path.join( + ghsl_folder, "POP", "GHS_POP_E2020_GLOBE_R2023A_54009_100_V1_0.tif" + ) + + # h3_0_list = h3_helper.generate_lvl0_lists(h3_level, return_gdf=True, buffer0=False) + # if verbose: # tPrint("H3_0 list generated") - - h3_1_list = h3_helper.generate_lvl1_lists(h3_level, return_gdf=True, buffer0=True, read_pickle=True, write_pickle=False) + + h3_1_list = h3_helper.generate_lvl1_lists( + h3_level, return_gdf=True, buffer0=True, read_pickle=True, write_pickle=False + ) if verbose: tPrint("H3_1 list generated") - + urban_pop_args = [] urban_args = [] for h3_1_key, cur_gdf in h3_1_list.items(): - # Set up mp arguments for urban population - pop_filename = 'GHS_POP_2020_Urban_Breakdown.csv' - pop_out_s3_key = f'Space2Stats/h3_stats_data/GLOBAL/{data_prefix_pop}/{h3_1_key}/{pop_filename}' + # Set up mp arguments for urban population + pop_filename = "GHS_POP_2020_Urban_Breakdown.csv" + pop_out_s3_key = f"Space2Stats/h3_stats_data/GLOBAL/{data_prefix_pop}/{h3_1_key}/{pop_filename}" pop_full_path = os.path.join("s3://", AWS_S3_BUCKET, pop_out_s3_key) try: tempPD = pd.read_csv(pop_full_path) except: - urban_pop_args.append([cur_gdf, "shape_id", ghs_pop, ghs_smod, pop_full_path, unq_urban]) - + urban_pop_args.append( + [cur_gdf, "shape_id", ghs_pop, ghs_smod, pop_full_path, unq_urban] + ) + # set up mp arguments for urban summary - urban_filename = 'GHS_SMOD_2020.csv' - urban_out_s3_key = f'Space2Stats/h3_stats_data/GLOBAL/{data_prefix}/{h3_1_key}/{urban_filename}' + urban_filename = "GHS_SMOD_2020.csv" + urban_out_s3_key = 
f"Space2Stats/h3_stats_data/GLOBAL/{data_prefix}/{h3_1_key}/{urban_filename}" urban_full_path = os.path.join("s3://", AWS_S3_BUCKET, urban_out_s3_key) urban_args.append([cur_gdf, "shape_id", ghs_smod, unq_urban, urban_full_path]) - + if run_urban: tPrint(f"Running calculations on urban: {len(urban_args)} processes") # Run multi processing on urban if multiprocess: - with multiprocessing.Pool(processes=min([70,len(urban_args)])) as pool: - results = pool.starmap(global_zonal.zonal_stats_categories, urban_args) + with multiprocessing.Pool(processes=min([70, len(urban_args)])) as pool: + results = pool.starmap(global_zonal.zonal_stats_categories, urban_args) else: for a in arg_list: results = run_zonal(*a) @@ -87,8 +97,10 @@ if run_urban_pop: # Run multi processing on urban_pop_calculations if multiprocess: - with multiprocessing.Pool(processes=min([70,len(urban_pop_args)])) as pool: - results = pool.starmap(global_zonal.zonal_stats_categorical, urban_pop_args) + with multiprocessing.Pool(processes=min([70, len(urban_pop_args)])) as pool: + results = pool.starmap( + global_zonal.zonal_stats_categorical, urban_pop_args + ) else: for a in arg_list: results = run_zonal(*a) @@ -103,4 +115,4 @@ "secret": AWS_SECRET_ACCESS_KEY, "token": AWS_SESSION_TOKEN, }, - ) \ No newline at end of file + ) diff --git a/postgres/chunk_parquet.py b/postgres/chunk_parquet.py index f6806e6..30db620 100644 --- a/postgres/chunk_parquet.py +++ b/postgres/chunk_parquet.py @@ -3,14 +3,16 @@ import pandas as pd chunk_dir = "parquet_chunks" -df = pd.read_parquet('space2stats_updated.parquet') +df = pd.read_parquet("space2stats_updated.parquet") chunk_size = 100000 # Number of rows per chunk if not os.path.exists(chunk_dir): os.mkdir(chunk_dir) for i in range(0, len(df), chunk_size): - chunk = df.iloc[i:i + chunk_size] - chunk.to_parquet(os.path.join(chunk_dir, f'space2stats_part_{i // chunk_size}.parquet')) + chunk = df.iloc[i : i + chunk_size] + chunk.to_parquet( + os.path.join(chunk_dir, f"space2stats_part_{i // chunk_size}.parquet") + ) -print("Parquet file split into smaller chunks.") \ No newline at end of file +print("Parquet file split into smaller chunks.") diff --git a/postgres/nyc_sample.py b/postgres/nyc_sample.py index cdaddd1..185c19b 100644 --- a/postgres/nyc_sample.py +++ b/postgres/nyc_sample.py @@ -1,29 +1,30 @@ -import pandas as pd import h3 - +import pandas as pd # Load the full dataset -df = pd.read_parquet('space2stats.parquet') +df = pd.read_parquet("space2stats.parquet") # Define the bounding box for New York City (approximate values) as a GeoJSON polygon nyc_polygon = { "type": "Polygon", - "coordinates": [[ - [-74.259090, 40.477399], - [-73.700272, 40.477399], - [-73.700272, 40.917577], - [-74.259090, 40.917577], - [-74.259090, 40.477399] - ]] + "coordinates": [ + [ + [-74.259090, 40.477399], + [-73.700272, 40.477399], + [-73.700272, 40.917577], + [-74.259090, 40.917577], + [-74.259090, 40.477399], + ] + ], } # Generate H3 indices for the bounding box using polyfill -resolution = 6 +resolution = 6 nyc_hexagons = h3.polyfill(nyc_polygon, resolution, geo_json_conformant=True) # Filter the dataframe for New York City H3 indices -nyc_df = df[df['hex_id'].isin(nyc_hexagons)] +nyc_df = df[df["hex_id"].isin(nyc_hexagons)] -nyc_df.to_parquet('nyc_sample.parquet') +nyc_df.to_parquet("nyc_sample.parquet") -print("Filtered file for New York City.") \ No newline at end of file +print("Filtered file for New York City.") diff --git a/space2stats_api/cdk/app.py b/space2stats_api/cdk/app.py index fdf3e7d..a531357 
100644 --- a/space2stats_api/cdk/app.py +++ b/space2stats_api/cdk/app.py @@ -5,8 +5,7 @@ settings = DeploymentSettings(_env_file="aws_deployment.env") env = Environment( - account=settings.CDK_DEFAULT_ACCOUNT, - region=settings.CDK_DEFAULT_REGION + account=settings.CDK_DEFAULT_ACCOUNT, region=settings.CDK_DEFAULT_REGION ) app = App() diff --git a/space2stats_api/cdk/aws_stack.py b/space2stats_api/cdk/aws_stack.py index 208f967..e0e3009 100644 --- a/space2stats_api/cdk/aws_stack.py +++ b/space2stats_api/cdk/aws_stack.py @@ -16,38 +16,40 @@ def __init__(self, scope: Construct, id: str, **kwargs) -> None: deployment_settings = DeploymentSettings(_env_file="./aws_deployment.env") lambda_function = PythonFunction( - self, "Space2StatsFunction", + self, + "Space2StatsFunction", entry="../src", runtime=_lambda.Runtime.PYTHON_3_11, index="space2stats/handler.py", timeout=Duration.seconds(120), handler="handler", environment=app_settings.model_dump(), - memory_size=1024 + memory_size=1024, ) certificate = acm.Certificate.from_certificate_arn( - self, "Certificate", - deployment_settings.CDK_CERTIFICATE_ARN + self, "Certificate", deployment_settings.CDK_CERTIFICATE_ARN ) domain_name = apigatewayv2.DomainName( - self, "DomainName", + self, + "DomainName", domain_name=deployment_settings.CDK_DOMAIN_NAME, - certificate=certificate + certificate=certificate, ) http_api = apigatewayv2.HttpApi( - self, "Space2StatsHttpApi", + self, + "Space2StatsHttpApi", default_integration=integrations.HttpLambdaIntegration( - "LambdaIntegration", - handler=lambda_function - ) + "LambdaIntegration", handler=lambda_function + ), ) apigatewayv2.ApiMapping( - self, "ApiMapping", + self, + "ApiMapping", api=http_api, domain_name=domain_name, - stage=http_api.default_stage + stage=http_api.default_stage, ) diff --git a/space2stats_api/cdk/settings.py b/space2stats_api/cdk/settings.py index fd22267..706189c 100644 --- a/space2stats_api/cdk/settings.py +++ b/space2stats_api/cdk/settings.py @@ -9,6 +9,7 @@ class AppSettings(BaseSettings): DB_PASSWORD: str DB_TABLE_NAME: str + class DeploymentSettings(BaseSettings): CDK_DEFAULT_ACCOUNT: str CDK_DEFAULT_REGION: str diff --git a/space2stats_api/src/space2stats/__main__.py b/space2stats_api/src/space2stats/__main__.py index edd875a..feb354d 100644 --- a/space2stats_api/src/space2stats/__main__.py +++ b/space2stats_api/src/space2stats/__main__.py @@ -1,4 +1,3 @@ - import os from .app import app @@ -11,8 +10,9 @@ if __name__ == "__main__": - - assert uvicorn is not None, "uvicorn must be installed: `python -m pip install 'space2stats[server]'`" + assert ( + uvicorn is not None + ), "uvicorn must be installed: `python -m pip install 'space2stats[server]'`" uvicorn.run( app=app, diff --git a/space2stats_api/src/space2stats/app.py b/space2stats_api/src/space2stats/app.py index f60b19f..ac640ba 100644 --- a/space2stats_api/src/space2stats/app.py +++ b/space2stats_api/src/space2stats/app.py @@ -1,16 +1,15 @@ from contextlib import asynccontextmanager +from typing import Any, Dict, List from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import ORJSONResponse - -from typing import Any, Dict, List - from starlette.requests import Request from starlette_cramjam.middleware import CompressionMiddleware -from .db import connect_to_db, close_db_connection -from .main import get_summaries_from_geom, get_available_fields, SummaryRequest +from .db import close_db_connection, connect_to_db +from .main import SummaryRequest, get_available_fields, 
get_summaries_from_geom + @asynccontextmanager async def lifespan(app: FastAPI): diff --git a/space2stats_api/src/space2stats/handler.py b/space2stats_api/src/space2stats/handler.py index 8884899..e9a7c33 100644 --- a/space2stats_api/src/space2stats/handler.py +++ b/space2stats_api/src/space2stats/handler.py @@ -5,8 +5,8 @@ from mangum import Mangum -from .db import connect_to_db from .app import app +from .db import connect_to_db @app.on_event("startup") diff --git a/space2stats_api/src/space2stats/main.py b/space2stats_api/src/space2stats/main.py index 93ad60e..2141c9d 100644 --- a/space2stats_api/src/space2stats/main.py +++ b/space2stats_api/src/space2stats/main.py @@ -1,21 +1,19 @@ - - - from typing import Dict, List, Literal, Optional import psycopg as pg +from geojson_pydantic import Feature, Polygon from psycopg import Connection from pydantic import BaseModel -from geojson_pydantic import Feature, Polygon from typing_extensions import TypeAlias -from .h3_utils import generate_h3_ids, generate_h3_geometries +from .h3_utils import generate_h3_geometries, generate_h3_ids from .settings import Settings settings = Settings() AoiModel: TypeAlias = Feature[Polygon, Dict] + class SummaryRequest(BaseModel): aoi: AoiModel spatial_join_method: Literal["touches", "centroid", "within"] @@ -74,9 +72,7 @@ def get_summaries_from_geom( # Format Summaries summaries: List[Dict] = [] - geometries = ( - generate_h3_geometries(h3_ids, geometry) if geometry else None - ) + geometries = generate_h3_geometries(h3_ids, geometry) if geometry else None for idx, row in enumerate(rows): summary = {"hex_id": row[0]} diff --git a/space2stats_api/src/tests/test_api.py b/space2stats_api/src/tests/test_api.py index 3d87a2a..75dabe7 100644 --- a/space2stats_api/src/tests/test_api.py +++ b/space2stats_api/src/tests/test_api.py @@ -1,10 +1,8 @@ from unittest.mock import patch import pytest -from pytest_postgresql.janitor import DatabaseJanitor - from fastapi.testclient import TestClient - +from pytest_postgresql.janitor import DatabaseJanitor aoi = { "type": "Feature", @@ -23,6 +21,7 @@ "properties": {}, } + @pytest.fixture(scope="session") def database(postgresql_proc): """Fake Database.""" @@ -62,7 +61,7 @@ def test_read_root(client): def test_get_summary(mock_get_summaries, client): mock_get_summaries.return_value = ( [("hex_1", 100, 200)], - ["hex_id", "sum_pop_2020", "sum_pop_f_10_2020"] + ["hex_id", "sum_pop_2020", "sum_pop_f_10_2020"], ) request_payload = { @@ -89,7 +88,7 @@ def test_get_summary(mock_get_summaries, client): def test_get_summary_with_geometry_polygon(mock_get_summaries, client): mock_get_summaries.return_value = ( [("hex_1", 100, 200)], - ["hex_id", "sum_pop_2020", "sum_pop_f_10_2020"] + ["hex_id", "sum_pop_2020", "sum_pop_f_10_2020"], ) request_payload = { @@ -119,7 +118,7 @@ def test_get_summary_with_geometry_polygon(mock_get_summaries, client): def test_get_summary_with_geometry_point(mock_get_summaries, client): mock_get_summaries.return_value = ( [("hex_1", 100, 200)], - ["hex_id", "sum_pop_2020", "sum_pop_f_10_2020"] + ["hex_id", "sum_pop_2020", "sum_pop_f_10_2020"], ) request_payload = { @@ -147,9 +146,11 @@ def test_get_summary_with_geometry_point(mock_get_summaries, client): @patch("space2stats.app.get_available_fields") def test_get_fields(mock_get_available_fields, client): - mock_get_available_fields.return_value = ["sum_pop_2020", - "sum_pop_f_10_2020", - "field3"] + mock_get_available_fields.return_value = [ + "sum_pop_2020", + "sum_pop_f_10_2020", + "field3", + ] response = 
client.get("/fields") diff --git a/space2stats_api/src/tests/test_h3_utils.py b/space2stats_api/src/tests/test_h3_utils.py index b7b8082..04881d9 100644 --- a/space2stats_api/src/tests/test_h3_utils.py +++ b/space2stats_api/src/tests/test_h3_utils.py @@ -1,6 +1,5 @@ import pytest from shapely.geometry import Polygon, mapping - from space2stats.h3_utils import generate_h3_geometries, generate_h3_ids polygon_coords = [ diff --git a/src/country_zonal.py b/src/country_zonal.py index ed7ee08..7f838ee 100755 --- a/src/country_zonal.py +++ b/src/country_zonal.py @@ -1,29 +1,42 @@ -import sys, os, importlib, json -import folium, shapely, rasterio, matplotlib +import importlib +import json +import os +import sys +from urllib.request import urlopen import contextily as ctx -import matplotlib.pyplot as plt -import matplotlib.patches as mpatches -import pandas as pd +import folium import geopandas as gpd +import GOSTRocks.ntlMisc as ntl +import GOSTRocks.rasterMisc as rMisc +import matplotlib +import matplotlib.patches as mpatches +import matplotlib.pyplot as plt import numpy as np - -from rasterio.crs import CRS -from mpl_toolkits.axes_grid1 import make_axes_locatable +import pandas as pd +import rasterio +import shapely +from GOSTRocks.misc import tPrint from h3 import h3 -from shapely.geometry import Polygon, Point, mapping +from mpl_toolkits.axes_grid1 import make_axes_locatable +from rasterio.crs import CRS +from shapely.geometry import Point, Polygon, mapping from shapely.ops import unary_union -from urllib.request import urlopen from tqdm import tqdm import h3_helper -import GOSTRocks.rasterMisc as rMisc -import GOSTRocks.ntlMisc as ntl -from GOSTRocks.misc import tPrint -def calculate_value(in_shp, zonal_res, h3_level, feat_id, fractional_res=True, - zonal_res_id='shape_id', default_sum='SUM'): - ''' tabulate hexabin stats for all bins that intersect shape in_shp + +def calculate_value( + in_shp, + zonal_res, + h3_level, + feat_id, + fractional_res=True, + zonal_res_id="shape_id", + default_sum="SUM", +): + """tabulate hexabin stats for all bins that intersect shape in_shp :param in_shp: shape of boundary to intersect with hexabins :type in_shp: shapely polygon @@ -42,25 +55,29 @@ def calculate_value(in_shp, zonal_res, h3_level, feat_id, fractional_res=True, :return: dictionary of results summarized based on type (SUM, MIN, MEAN, MAX) :rtype: Dictionary - ''' + """ + def get_intersection(admin_shp, hex_shp): - ''' get fraction of hex_shp that is inside admin_shp - ''' + """get fraction of hex_shp that is inside admin_shp""" if admin_shp.contains(hex_shp): - return(1) + return 1 else: - return(admin_shp.intersection(hex_shp).area/hex_shp.area) - - res = {'id':feat_id} + return admin_shp.intersection(hex_shp).area / hex_shp.area + + res = {"id": feat_id} process_h3 = True # Generate h3 cells that intersect current shape; if none are generated first time through, buffer # the geometry by a little bit, and then search again - while process_h3: + while process_h3: if in_shp.geom_type == "Polygon": - sel_h3 = h3.polyfill(in_shp.__geo_interface__, h3_level, geo_json_conformant=True) + sel_h3 = h3.polyfill( + in_shp.__geo_interface__, h3_level, geo_json_conformant=True + ) elif in_shp.geom_type == "MultiPolygon": for cPoly in in_shp: - temp_h3 = h3.polyfill(cPoly.__geo_interface__, h3_level, geo_json_conformant=True) + temp_h3 = h3.polyfill( + cPoly.__geo_interface__, h3_level, geo_json_conformant=True + ) try: sel_h3 = sel_h3.union(temp_h3) except: @@ -70,45 +87,67 @@ def get_intersection(admin_shp, 
hex_shp): if len(sel_h3) > 0: hex_poly = lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)) - all_polys = gpd.GeoSeries(list(map(hex_poly, sel_h3)), index=sel_h3, crs="EPSG:4326") - all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=['geometry']) - all_polys['shape_id'] = list(all_polys.index) + all_polys = gpd.GeoSeries( + list(map(hex_poly, sel_h3)), index=sel_h3, crs="EPSG:4326" + ) + all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=["geometry"]) + all_polys["shape_id"] = list(all_polys.index) if fractional_res: - all_polys['inter_area'] = all_polys['geometry'].apply(lambda x: get_intersection(in_shp, x)) + all_polys["inter_area"] = all_polys["geometry"].apply( + lambda x: get_intersection(in_shp, x) + ) else: - all_polys['inter_area'] = 1 - all_polys = pd.merge(all_polys, zonal_res, left_on='shape_id', right_on=zonal_res_id) - for col in all_polys.columns: - if not col in ['inter_area','geometry','shape_id']: + all_polys["inter_area"] = 1 + all_polys = pd.merge( + all_polys, zonal_res, left_on="shape_id", right_on=zonal_res_id + ) + for col in all_polys.columns: + if not col in ["inter_area", "geometry", "shape_id"]: calc_type = default_sum - if "SUM" in col: calc_type = "SUM" - if "MIN" in col: calc_type = "MIN" - if "MAX" in col: calc_type = "MAX" - if "MEAN" in col: calc_type = "MEAN" + if "SUM" in col: + calc_type = "SUM" + if "MIN" in col: + calc_type = "MIN" + if "MAX" in col: + calc_type = "MAX" + if "MEAN" in col: + calc_type = "MEAN" try: - if calc_type == "SUM": # For sum columns, multiply column by inter_area and sum results - cur_val = sum(all_polys[col] * all_polys['inter_area']) + if ( + calc_type == "SUM" + ): # For sum columns, multiply column by inter_area and sum results + cur_val = sum(all_polys[col] * all_polys["inter_area"]) elif calc_type == "MIN": cur_val = all_polys[col].min() elif calc_type == "MAX": cur_val = all_polys[col].max() elif calc_type == "MEAN": - cur_val = sum(all_polys[col] * all_polys['inter_area'])/sum(all_polys['inter_area']) + cur_val = sum(all_polys[col] * all_polys["inter_area"]) / sum( + all_polys["inter_area"] + ) res[col] = cur_val except: pass try: - del(cur_val) + del cur_val except: pass else: pass - return(res) + return res + + +def connect_polygons_h3_stats( + inA, + stats_df, + h3_level, + id_col, + fractional_res=True, + zonal_res_id="shape_id", + default_sum="SUM", +): + """merge stats from hexabin stats dataframe (stats_df) with the inA geodataframe -def connect_polygons_h3_stats(inA, stats_df, h3_level, id_col, fractional_res=True, - zonal_res_id='shape_id', default_sum='SUM'): - ''' merge stats from hexabin stats dataframe (stats_df) with the inA geodataframe - :param inA: input boundary dataset :type inA: geopandas.GeoDataFrame :param stats_df: input hexabin stats dataset @@ -123,180 +162,214 @@ def connect_polygons_h3_stats(inA, stats_df, h3_level, id_col, fractional_res=Tr function sill pick up [SUM,MIN,MAX,MEAN], defaults to sum :type default_sum: string, optional - + :return: pandas dataframe with attached statistics and matching id from id_col :rtype: geopandas.GeoDataFrame - ''' + """ all_res = [] - for idx, row in inA.iterrows(): + for idx, row in inA.iterrows(): try: - all_res.append(calculate_value(row['geometry'], stats_df, h3_level, row[id_col], fractional_res, zonal_res_id, default_sum)) + all_res.append( + calculate_value( + row["geometry"], + stats_df, + h3_level, + row[id_col], + fractional_res, + zonal_res_id, + default_sum, + ) + ) except: - print(f'Error processing {idx}') - - 
return(pd.DataFrame(all_res)) - -class country_h3_zonal(): - ''' Generate h3 grid at prescribed level; intersect with admin boundary; run zonal stats - - :param iso3: Country ISO3 code - :type iso3: string - :param adm_bounds: admin boundaries for joining with h3 grid - :type adm_bounds: geopandas.GeoDataFrame - :param adm_bounds_id: column in adm_bounds used as unique ID - :type adm_bounds_id: string - :param h3_level: size of h3 grid to create; we suggest starting with 6 or 5 (5 is larger) - :type h3_level: int - ''' - def __init__(self, iso3, adm_bounds, adm_bounds_id, h3_level, out_folder, h3_grid = ''): + print(f"Error processing {idx}") + + return pd.DataFrame(all_res) + + +class country_h3_zonal: + """Generate h3 grid at prescribed level; intersect with admin boundary; run zonal stats + + :param iso3: Country ISO3 code + :type iso3: string + :param adm_bounds: admin boundaries for joining with h3 grid + :type adm_bounds: geopandas.GeoDataFrame + :param adm_bounds_id: column in adm_bounds used as unique ID + :type adm_bounds_id: string + :param h3_level: size of h3 grid to create; we suggest starting with 6 or 5 (5 is larger) + :type h3_level: int + """ + + def __init__( + self, iso3, adm_bounds, adm_bounds_id, h3_level, out_folder, h3_grid="" + ): self.iso3 = iso3 self.adm_bounds = adm_bounds self.adm_bounds_id = adm_bounds_id self.h3_level = h3_level self.out_folder = out_folder - - #define output variables - if h3_grid != '': - self.out_h3_grid = os.path.join(out_folder, f'h3_level_{h3_level}.geojson') + + # define output variables + if h3_grid != "": + self.out_h3_grid = os.path.join(out_folder, f"h3_level_{h3_level}.geojson") else: self.out_h3_grid = h3_grid try: self.h3_cells = gpd.read_file(self.out_h3_grid) except: pass - self.out_admin = os.path.join(out_folder, 'admin_bounds.geojson') + self.out_admin = os.path.join(out_folder, "admin_bounds.geojson") try: self.adm_bounds_h3 = gpd.read_file(self.out_admin) except: pass - - + def generate_h3_grid(self, cols_to_include=[], attach_admin=False): - ''' Generate the h3 grid and join to the admin boundaries - - :param cols_to_include: list of columns to include from adm_bounds in joined output - :type cols_to_include: list of strings - ''' + """Generate the h3 grid and join to the admin boundaries + + :param cols_to_include: list of columns to include from adm_bounds in joined output + :type cols_to_include: list of strings + """ selA = self.adm_bounds try: - return(self.h3_cells) - except: + return self.h3_cells + except: pass - + try: h3_cells = self.h3_cells.copy() - h3_cells = h3_cells.loc[:,['shape_id','geometry']] + h3_cells = h3_cells.loc[:, ["shape_id", "geometry"]] except: h3_cells = h3_helper.generate_h3_gdf(self.adm_bounds, self.h3_level) - - h3_cells['centroid'] = h3_cells['geometry'].apply(lambda x: x.centroid) - h3_centroids = h3_cells.set_geometry('centroid') + + h3_cells["centroid"] = h3_cells["geometry"].apply(lambda x: x.centroid) + h3_centroids = h3_cells.set_geometry("centroid") cols_to_include.append("geometry") cols_to_include.append(self.adm_bounds_id) cols_to_include = list(set(cols_to_include)) - h3_joined = gpd.sjoin(h3_centroids, selA.loc[:,cols_to_include], how='left') - if attach_admin: - h3_pivot = pd.pivot_table(h3_joined, index=self.adm_bounds_id, aggfunc={cols_to_include[0]:len}) - h3_pivot.columns = [*h3_pivot.columns[:-1], 'h3_count'] + h3_joined = gpd.sjoin(h3_centroids, selA.loc[:, cols_to_include], how="left") + if attach_admin: + h3_pivot = pd.pivot_table( + h3_joined, 
index=self.adm_bounds_id, aggfunc={cols_to_include[0]: len} + ) + h3_pivot.columns = [*h3_pivot.columns[:-1], "h3_count"] h3_pivot = h3_pivot.reset_index() - h3_pivot = selA.loc[:,cols_to_include].merge(h3_pivot, how='left', on=self.adm_bounds_id) + h3_pivot = selA.loc[:, cols_to_include].merge( + h3_pivot, how="left", on=self.adm_bounds_id + ) self.adm_bounds_h3 = h3_pivot - h3_joined = h3_joined.set_geometry("geometry").drop(['centroid'], axis=1) + h3_joined = h3_joined.set_geometry("geometry").drop(["centroid"], axis=1) h3_joined = h3_joined.reset_index() self.h3_cells = h3_joined - - return(h3_joined) - + + return h3_joined + def summarize_adm_h3_join(self, verbose=False): - ''' Summarize the join between the adm bounds and the h3 grid: - 1. Number of h3 cells - 2. Number of adm bounds - 3. Number of adm bounds with 0 h3 centroids - 4. Number of adm bounds with 0 - 1 h3 centroids - 5. Number of adm bounds with 2 - 5 h3 centroids - ''' + """Summarize the join between the adm bounds and the h3 grid: + 1. Number of h3 cells + 2. Number of adm bounds + 3. Number of adm bounds with 0 h3 centroids + 4. Number of adm bounds with 0 - 1 h3 centroids + 5. Number of adm bounds with 2 - 5 h3 centroids + """ try: inD = self.adm_bounds_h3.copy() except: self.generate_h3_grid() inD = self.adm_bounds_h3.copy() - + n_h3 = self.h3_cells.shape[0] n_adm = inD.shape[0] - n_adm_0 = inD.loc[inD['h3_count'].isna()].shape[0] - n_adm_1 = inD.loc[inD['h3_count'] == 1].shape[0] - n_adm_2 = inD.loc[(inD['h3_count'] < 6) & (inD['h3_count'] > 1)].shape[0] - + n_adm_0 = inD.loc[inD["h3_count"].isna()].shape[0] + n_adm_1 = inD.loc[inD["h3_count"] == 1].shape[0] + n_adm_2 = inD.loc[(inD["h3_count"] < 6) & (inD["h3_count"] > 1)].shape[0] + if verbose: - tPrint(f"{self.iso3}: H3 [{n_h3}], ADM [{n_adm}], ADM0 [{n_adm_0}], ADM1 [{n_adm_1}], ADM2 [{n_adm_2}]") - return([n_h3, n_adm, n_adm_0, n_adm_1, n_adm_2]) - + tPrint( + f"{self.iso3}: H3 [{n_h3}], ADM [{n_adm}], ADM0 [{n_adm_0}], ADM1 [{n_adm_1}], ADM2 [{n_adm_2}]" + ) + return [n_h3, n_adm, n_adm_0, n_adm_1, n_adm_2] + def write_output(self, write_h3=True, write_admin=False): - ''' write geospatial data to disk - - ''' + """write geospatial data to disk""" if write_h3: self.h3_cells.to_file(self.out_h3_grid, driver="GeoJSON") if write_admin: self.adm_bounds_h3.to_file(self.out_admin, driver="GeoJSON") - def zonal_raster(self, in_raster, minVal='', maxVal='', all_touched=False, weighted=False): - ''' + def zonal_raster( + self, in_raster, minVal="", maxVal="", all_touched=False, weighted=False + ): + """ - :param in_raster: string path to raster file for calculations - :type in_raster: string - :param minVal: minimum value in in_raster to pass to zonal function; everything below is considered 0. Default is no threshold - :type minVal: numeric - ''' + :param in_raster: string path to raster file for calculations + :type in_raster: string + :param minVal: minimum value in in_raster to pass to zonal function; everything below is considered 0. 
Default is no threshold + :type minVal: numeric + """ h3_grid = self.generate_h3_grid() - + if isinstance(in_raster, str): - inR = rasterio.open(in_raster, 'r') + inR = rasterio.open(in_raster, "r") else: inR = in_raster - + # Run zonal statistics on pop_raster - res = rMisc.zonalStats(h3_grid, inR, reProj=True, minVal=minVal, maxVal=maxVal, - allTouched=all_touched, weighted=weighted) - res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) - res['shape_id'] = h3_grid['shape_id'].astype(object) - return(res) - - - - def zonal_raster_population(self, in_raster, pop_raster, raster_thresh, thresh_label='thresh', - resampling_type="sum", minVal='', maxVal='', all_touched=False, weighted=False): - ''' extract raster data from in_raster, urban_raster for selected country, standardize urban_raster to in_raster - - :param in_raster: string path to raster file for calculations - :type in_raster: string - :param pop_raster: string path to population file for summarizing calculations - :type pop_raster: string - :param raster_thresh: value to threshold in_raster in order to summarize population - :type raster_thresh: number - - :param thresh_label: label to append to thresholded summaries in output table, default is to 'thresh' - :type thresh_label: string - :param resampling_type: how to re-sample in_raster to pop_raster, using rasterio resampling options, default is to 'SUM' - :type resampling_type: string - :param minVal: minimum value in in_raster to pass to zonal function; everything below is considered 0. Default is no threshold - :type minVal: numeric - :param urban_mask_val: list of values in urban_raster to be used for mask - :type urban_mask_val: list of int - :param unqVals: - :type unqVals: - - - ''' + res = rMisc.zonalStats( + h3_grid, + inR, + reProj=True, + minVal=minVal, + maxVal=maxVal, + allTouched=all_touched, + weighted=weighted, + ) + res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) + res["shape_id"] = h3_grid["shape_id"].astype(object) + return res + + def zonal_raster_population( + self, + in_raster, + pop_raster, + raster_thresh, + thresh_label="thresh", + resampling_type="sum", + minVal="", + maxVal="", + all_touched=False, + weighted=False, + ): + """extract raster data from in_raster, urban_raster for selected country, standardize urban_raster to in_raster + + :param in_raster: string path to raster file for calculations + :type in_raster: string + :param pop_raster: string path to population file for summarizing calculations + :type pop_raster: string + :param raster_thresh: value to threshold in_raster in order to summarize population + :type raster_thresh: number + + :param thresh_label: label to append to thresholded summaries in output table, default is to 'thresh' + :type thresh_label: string + :param resampling_type: how to re-sample in_raster to pop_raster, using rasterio resampling options, default is to 'SUM' + :type resampling_type: string + :param minVal: minimum value in in_raster to pass to zonal function; everything below is considered 0. 
Default is no threshold + :type minVal: numeric + :param urban_mask_val: list of values in urban_raster to be used for mask + :type urban_mask_val: list of int + :param unqVals: + :type unqVals: + + + """ h3_grid = self.generate_h3_grid() - + if isinstance(in_raster, str): - inR = rasterio.open(in_raster, 'r') + inR = rasterio.open(in_raster, "r") else: inR = in_raster if isinstance(pop_raster, str): - popR = rasterio.open(pop_raster, 'r') + popR = rasterio.open(pop_raster, "r") else: popR = pop_raster @@ -308,90 +381,148 @@ def zonal_raster_population(self, in_raster, pop_raster, raster_thresh, thresh_l with rMisc.create_rasterio_inmemory(profile1, inN) as tempR: # Run zonal statistics on pop_raster - res = rMisc.zonalStats(h3_grid, tempR, reProj=True, minVal=minVal, maxVal=maxVal, - allTouched=all_touched, weighted=weighted) - res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) - res['shape_id'] = h3_grid['shape_id'].astype(object) + res = rMisc.zonalStats( + h3_grid, + tempR, + reProj=True, + minVal=minVal, + maxVal=maxVal, + allTouched=all_touched, + weighted=weighted, + ) + res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) + res["shape_id"] = h3_grid["shape_id"].astype(object) # Standardize in_raster to pop_raster with rMisc.create_rasterio_inmemory(profile2, inD) as tempD: - inD, profile2 = rMisc.standardizeInputRasters(tempD, tempR, resampling_type=resampling_type) + inD, profile2 = rMisc.standardizeInputRasters( + tempD, tempR, resampling_type=resampling_type + ) # threhsold in raster to create binary - inR_thresh = (inD >= raster_thresh) + inR_thresh = inD >= raster_thresh pop_thresh = inN * inR_thresh - + # Summarize thresholded populatino with rMisc.create_rasterio_inmemory(profile1, pop_thresh) as urbanR: - resU = rMisc.zonalStats(h3_grid, urbanR, reProj=True, minVal=minVal, maxVal=maxVal, - allTouched=all_touched, weighted=weighted) - resU = pd.DataFrame(resU, columns=[f"SUM_{thresh_label}", f"MIN_{thresh_label}", f"MAX_{thresh_label}", f"MEAN_{thresh_label}"]) + resU = rMisc.zonalStats( + h3_grid, + urbanR, + reProj=True, + minVal=minVal, + maxVal=maxVal, + allTouched=all_touched, + weighted=weighted, + ) + resU = pd.DataFrame( + resU, + columns=[ + f"SUM_{thresh_label}", + f"MIN_{thresh_label}", + f"MAX_{thresh_label}", + f"MEAN_{thresh_label}", + ], + ) resU = resU.astype(float) - resU['shape_id'] = h3_grid['shape_id'] - res_final = res.merge(resU, on='shape_id') + resU["shape_id"] = h3_grid["shape_id"] + res_final = res.merge(resU, on="shape_id") return res_final + def zonal_raster_urban( + self, + in_raster, + urban_raster, + resampling_type="nearest", + minVal="", + maxVal="", + rastType="N", + urban_mask_val=[21, 22, 23, 30], + unqVals=[], + all_touched=False, + weighted=False, + ): + """extract raster data from in_raster, urban_raster for selected country, standardize urban_raster to in_raster + :param in_raster: string path to raster file for calculations + :type in_raster: string + :param in_raster: string path to urban file tiering calculations + :type in_raster: string + :param minVal: minimum value in in_raster to pass to zonal function; everything below is considered 0. Default is no threshold + :type minVal: numeric + :param rastType: define the input data in the in_raster. 
Options are N (for numeric, default) or C (categorical) + :type rastType: string + :param urban_mask_val: list of values in urban_raster to be used for mask + :type urban_mask_val: list of int + :param unqVals: + :type unqVals: - def zonal_raster_urban(self, in_raster, urban_raster, resampling_type="nearest", minVal='', maxVal='', rastType='N', - urban_mask_val=[21,22,23,30], unqVals=[], all_touched=False, weighted=False): - ''' extract raster data from in_raster, urban_raster for selected country, standardize urban_raster to in_raster - - :param in_raster: string path to raster file for calculations - :type in_raster: string - :param in_raster: string path to urban file tiering calculations - :type in_raster: string - :param minVal: minimum value in in_raster to pass to zonal function; everything below is considered 0. Default is no threshold - :type minVal: numeric - :param rastType: define the input data in the in_raster. Options are N (for numeric, default) or C (categorical) - :type rastType: string - :param urban_mask_val: list of values in urban_raster to be used for mask - :type urban_mask_val: list of int - :param unqVals: - :type unqVals: - - :return: dictionary of rasterio objects for in_raster and urban_raster - :rtype: dictionary of 'in_raster': rasterio.DatasetReader, 'urban_raster': rasterio.DatasetReader - ''' + :return: dictionary of rasterio objects for in_raster and urban_raster + :rtype: dictionary of 'in_raster': rasterio.DatasetReader, 'urban_raster': rasterio.DatasetReader + """ h3_grid = self.generate_h3_grid() - + if isinstance(in_raster, str): - inR = rasterio.open(in_raster, 'r') + inR = rasterio.open(in_raster, "r") else: inR = in_raster - + if isinstance(urban_raster, str): - inU = rasterio.open(urban_raster, 'r') + inU = rasterio.open(urban_raster, "r") else: inU = urban_raster # Clip inR to extent of country inN, profile1 = rMisc.clipRaster(inR, self.adm_bounds, crop=False) with rMisc.create_rasterio_inmemory(profile1, inN) as tempR: - if rastType == 'N': + if rastType == "N": # Run zonal statistics on in_raster - res = rMisc.zonalStats(h3_grid, tempR, rastType=rastType, reProj=True, minVal=minVal, maxVal=maxVal, - allTouched=all_touched, weighted=weighted) - res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) - res['shape_id'] = h3_grid['shape_id'].astype(object) + res = rMisc.zonalStats( + h3_grid, + tempR, + rastType=rastType, + reProj=True, + minVal=minVal, + maxVal=maxVal, + allTouched=all_touched, + weighted=weighted, + ) + res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"]) + res["shape_id"] = h3_grid["shape_id"].astype(object) # Standardize in_urban raster to clippedR - outU, profile2 = rMisc.standardizeInputRasters(inU, tempR, resampling_type=resampling_type) + outU, profile2 = rMisc.standardizeInputRasters( + inU, tempR, resampling_type=resampling_type + ) # Isolate values in in_raster that are urban - inN_urban = np.isin(outU, urban_mask_val) + inN_urban = np.isin(outU, urban_mask_val) with rMisc.create_rasterio_inmemory(profile2, inN_urban) as urbanR: - resU = rMisc.zonalStats(h3_grid, urbanR, rastType=rastType, reProj=True, minVal=minVal, maxVal=maxVal, - allTouched=all_touched, weighted=weighted) - resU = pd.DataFrame(resU, columns=["SUM_urban", "MIN_urban", "MAX_urban", "MEAN_urban"]) + resU = rMisc.zonalStats( + h3_grid, + urbanR, + rastType=rastType, + reProj=True, + minVal=minVal, + maxVal=maxVal, + allTouched=all_touched, + weighted=weighted, + ) + resU = pd.DataFrame( + resU, + columns=["SUM_urban", "MIN_urban", 
"MAX_urban", "MEAN_urban"], + ) resU = resU.astype(float) - resU['shape_id'] = h3_grid['shape_id'] - res_final = res.merge(resU, on='shape_id') - elif rastType == 'C': + resU["shape_id"] = h3_grid["shape_id"] + res_final = res.merge(resU, on="shape_id") + elif rastType == "C": # Run zonal statistics on in_raster - res_final = rMisc.zonalStats(h3_grid, tempR, rastType=rastType, reProj=True, unqVals=unqVals, - allTouched=all_touched, weighted=weighted) - res_final = pd.DataFrame(res_final, columns=[f'c_x' for x in unqVals]) - res_final['shape_id'] = h3_grid['shape_id'].astype(object) - - return(res_final) - - - \ No newline at end of file + res_final = rMisc.zonalStats( + h3_grid, + tempR, + rastType=rastType, + reProj=True, + unqVals=unqVals, + allTouched=all_touched, + weighted=weighted, + ) + res_final = pd.DataFrame(res_final, columns=[f"c_x" for x in unqVals]) + res_final["shape_id"] = h3_grid["shape_id"].astype(object) + + return res_final diff --git a/src/global_zonal.py b/src/global_zonal.py index cfeba2f..31aceed 100755 --- a/src/global_zonal.py +++ b/src/global_zonal.py @@ -1,23 +1,30 @@ -import boto3, os +import os +from urllib.request import urlopen -import pandas as pd +import boto3 import geopandas as gpd +import GOSTrocks.ntlMisc as ntl +import GOSTrocks.rasterMisc as rMisc import numpy as np - -from rasterio.crs import CRS +import pandas as pd +from GOSTrocks.misc import tPrint from h3 import h3 -from shapely.geometry import Polygon, Point, mapping +from rasterio.crs import CRS +from shapely.geometry import Point, Polygon, mapping from shapely.ops import unary_union -from urllib.request import urlopen from tqdm import tqdm import h3_helper -import GOSTrocks.rasterMisc as rMisc -import GOSTrocks.ntlMisc as ntl -from GOSTrocks.misc import tPrint -def get_global_table_from_s3(variable, bucket='wbg-geography01', prefix='Space2Stats/h3_stats_data/GLOBAL/', verbose=False, read_data=True): - """ Get pandas dataframe of all csv files in S3 bucket that match the variable name + +def get_global_table_from_s3( + variable, + bucket="wbg-geography01", + prefix="Space2Stats/h3_stats_data/GLOBAL/", + verbose=False, + read_data=True, +): + """Get pandas dataframe of all csv files in S3 bucket that match the variable name Parameters ---------- @@ -33,10 +40,10 @@ def get_global_table_from_s3(variable, bucket='wbg-geography01', prefix='Space2S If True, return results as pandas data frames for each sub-value in variable, otherwise returns a list of s3 prefixes for each sub-value, by default True """ - - s3client = boto3.client('s3') - - # Loop through the S3 bucket and get all the keys for files that are .tif + + s3client = boto3.client("s3") + + # Loop through the S3 bucket and get all the keys for files that are .tif prefix = f"{prefix}{variable}" more_results = True loops = 0 @@ -45,20 +52,22 @@ def get_global_table_from_s3(variable, bucket='wbg-geography01', prefix='Space2S if verbose: print(f"Completed loop: {loops}") if loops > 0: - objects = s3client.list_objects_v2(Bucket=bucket, Prefix=prefix, ContinuationToken=token) + objects = s3client.list_objects_v2( + Bucket=bucket, Prefix=prefix, ContinuationToken=token + ) else: objects = s3client.list_objects_v2(Bucket=bucket, Prefix=prefix) - more_results = objects['IsTruncated'] + more_results = objects["IsTruncated"] if more_results: - token = objects['NextContinuationToken'] + token = objects["NextContinuationToken"] loops += 1 - for res in objects['Contents']: - if res['Key'].endswith('csv'): - cur_variable = 
os.path.basename(res['Key']).replace(".csv", "") + for res in objects["Contents"]: + if res["Key"].endswith("csv"): + cur_variable = os.path.basename(res["Key"]).replace(".csv", "") try: - good_res[cur_variable].append(res['Key']) + good_res[cur_variable].append(res["Key"]) except: - good_res[cur_variable] = [res['Key']] + good_res[cur_variable] = [res["Key"]] if read_data: for key, value in good_res.items(): for idx, val in enumerate(value): @@ -67,12 +76,19 @@ def get_global_table_from_s3(variable, bucket='wbg-geography01', prefix='Space2S else: cur_df = pd.concat([cur_df, pd.read_csv(f"s3://{bucket}/{val}")]) good_res[key] = cur_df - return(good_res) + return good_res -def calculate_value(in_shp, zonal_res, h3_level, feat_id, fractional_res=True, - zonal_res_id='id', default_sum='SUM'): - ''' tabulate hexabin stats for all bins that intersect shape in_shp +def calculate_value( + in_shp, + zonal_res, + h3_level, + feat_id, + fractional_res=True, + zonal_res_id="id", + default_sum="SUM", +): + """tabulate hexabin stats for all bins that intersect shape in_shp :param in_shp: shape of boundary to intersect with hexabins :type in_shp: shapely polygon @@ -91,25 +107,29 @@ def calculate_value(in_shp, zonal_res, h3_level, feat_id, fractional_res=True, :return: dictionary of results summarized based on type (SUM, MIN, MEAN, MAX) :rtype: Dictionary - ''' + """ + def get_intersection(admin_shp, hex_shp): - ''' get fraction of hex_shp that is inside admin_shp - ''' + """get fraction of hex_shp that is inside admin_shp""" if admin_shp.contains(hex_shp): - return(1) + return 1 else: - return(admin_shp.intersection(hex_shp).area/hex_shp.area) - - res = {'id':feat_id} + return admin_shp.intersection(hex_shp).area / hex_shp.area + + res = {"id": feat_id} process_h3 = True # Generate h3 cells that intersect current shape; if none are generated first time through, buffer # the geometry by a little bit, and then search again - while process_h3: - if in_shp.geom_type == 'Polygon': - sel_h3 = h3.polyfill(in_shp.__geo_interface__, h3_level, geo_json_conformant=True) + while process_h3: + if in_shp.geom_type == "Polygon": + sel_h3 = h3.polyfill( + in_shp.__geo_interface__, h3_level, geo_json_conformant=True + ) else: for cPoly in in_shp: - temp_h3 = h3.polyfill(cPoly.__geo_interface__, h3_level, geo_json_conformant=True) + temp_h3 = h3.polyfill( + cPoly.__geo_interface__, h3_level, geo_json_conformant=True + ) try: sel_h3 = sel_h3.union(temp_h3) except: @@ -119,45 +139,67 @@ def get_intersection(admin_shp, hex_shp): if len(sel_h3) > 0: hex_poly = lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)) - all_polys = gpd.GeoSeries(list(map(hex_poly, sel_h3)), index=sel_h3, crs="EPSG:4326") - all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=['geometry']) - all_polys['shape_id'] = list(all_polys.index) + all_polys = gpd.GeoSeries( + list(map(hex_poly, sel_h3)), index=sel_h3, crs="EPSG:4326" + ) + all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=["geometry"]) + all_polys["shape_id"] = list(all_polys.index) if fractional_res: - all_polys['inter_area'] = all_polys['geometry'].apply(lambda x: get_intersection(in_shp, x)) + all_polys["inter_area"] = all_polys["geometry"].apply( + lambda x: get_intersection(in_shp, x) + ) else: - all_polys['inter_area'] = 1 - all_polys = pd.merge(all_polys, zonal_res, left_on='shape_id', right_on=zonal_res_id) - for col in all_polys.columns: - if not col in ['inter_area','geometry','shape_id']: + all_polys["inter_area"] = 1 + all_polys = pd.merge( + 
all_polys, zonal_res, left_on="shape_id", right_on=zonal_res_id + ) + for col in all_polys.columns: + if not col in ["inter_area", "geometry", "shape_id"]: calc_type = default_sum - if "SUM" in col: calc_type = "SUM" - if "MIN" in col: calc_type = "MIN" - if "MAX" in col: calc_type = "MAX" - if "MEAN" in col: calc_type = "MEAN" + if "SUM" in col: + calc_type = "SUM" + if "MIN" in col: + calc_type = "MIN" + if "MAX" in col: + calc_type = "MAX" + if "MEAN" in col: + calc_type = "MEAN" try: - if calc_type == "SUM": # For sum columns, multiply column by inter_area and sum results - cur_val = sum(all_polys[col] * all_polys['inter_area']) + if ( + calc_type == "SUM" + ): # For sum columns, multiply column by inter_area and sum results + cur_val = sum(all_polys[col] * all_polys["inter_area"]) elif calc_type == "MIN": cur_val = all_polys[col].min() elif calc_type == "MAX": cur_val = all_polys[col].max() elif calc_type == "MEAN": - cur_val = sum(all_polys[col] * all_polys['inter_area'])/sum(all_polys['inter_area']) + cur_val = sum(all_polys[col] * all_polys["inter_area"]) / sum( + all_polys["inter_area"] + ) res[col] = cur_val except: pass try: - del(cur_val) + del cur_val except: pass else: pass - return(res) + return res + + +def connect_polygons_h3_stats( + inA, + stats_df, + h3_level, + id_col, + fractional_res=True, + zonal_res_id="id", + default_sum="SUM", +): + """merge stats from hexabin stats dataframe (stats_df) with the inA geodataframe -def connect_polygons_h3_stats(inA, stats_df, h3_level, id_col, fractional_res=True, - zonal_res_id='id', default_sum='SUM'): - ''' merge stats from hexabin stats dataframe (stats_df) with the inA geodataframe - :param inA: input boundary dataset :type inA: geopandas.GeoDataFrame :param stats_df: input hexabin stats dataset @@ -172,24 +214,35 @@ def connect_polygons_h3_stats(inA, stats_df, h3_level, id_col, fractional_res=Tr function sill pick up [SUM,MIN,MAX,MEAN], defaults to sum :type default_sum: string, optional - + :return: pandas dataframe with attached statistics and matching id from id_col :rtype: geopandas.GeoDataFrame - ''' + """ all_res = [] - for idx, row in inA.iterrows(): - all_res.append(calculate_value(row['geometry'], stats_df, h3_level, row[id_col], fractional_res, zonal_res_id, default_sum)) - ''' + for idx, row in inA.iterrows(): + all_res.append( + calculate_value( + row["geometry"], + stats_df, + h3_level, + row[id_col], + fractional_res, + zonal_res_id, + default_sum, + ) + ) + """ try: all_res.append(calculate_value(row['geometry'], stats_df, h3_level, row[id_col], fractional_res, zonal_res_id, default_sum)) except: print(f'Error processing {idx}') - ''' - - return(pd.DataFrame(all_res)) + """ + + return pd.DataFrame(all_res) + def generate_lvl0_lists(h3_lvl): - """ generate a dictionary with keys as lvl0 codes with all children at h3_lvl level as values + """generate a dictionary with keys as lvl0 codes with all children at h3_lvl level as values Parameters ---------- @@ -211,6 +264,7 @@ def generate_lvl0_lists(h3_lvl): return h3_lvl0_children + def calculate_zonal_h3_list(h3_list, raster_data, output_file=""): """_summary_ @@ -225,63 +279,86 @@ def calculate_zonal_h3_list(h3_list, raster_data, output_file=""): """ # Convert list of h3 cells to geometry hex_poly = lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)) - + all_polys = gpd.GeoSeries(list(map(hex_poly, h3_list)), index=h3_list, crs=4326) - all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=['geometry']) - all_polys['shape_id'] = 
list(all_polys.index)
+    all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=["geometry"])
+    all_polys["shape_id"] = list(all_polys.index)

     res = rMisc.zonalStats(all_polys, raster_data)
-    res = pd.DataFrame(res, columns=['SUM', 'MIN', 'MAX', 'MEAN'])
+    res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"])
     if output_file != "":
         res.to_csv(output_file)
-
-    return(res)

-def zonal_stats_numerical(gdf, gdf_id, raster_file, out_file,
-                            buffer0=False, minVal=None, maxVal=None, verbose=False):
-    ''' Run zonal stats on a continuous raster file using a list of h3 cells
-    '''
+    return res
+
+
+def zonal_stats_numerical(
+    gdf,
+    gdf_id,
+    raster_file,
+    out_file,
+    buffer0=False,
+    minVal=None,
+    maxVal=None,
+    verbose=False,
+):
+    """Run zonal stats on a continuous raster file using a list of h3 cells"""
     if verbose:
-        tPrint(f'Starting zonal stats on {raster_file}')
+        tPrint(f"Starting zonal stats on {raster_file}")
     if buffer0:
-        gdf['geometry'] = gdf['geometry'].buffer(0)
-    res = rMisc.zonalStats(gdf, raster_file, minVal=minVal, maxVal=maxVal, verbose=verbose, reProj=True)
-    res = pd.DataFrame(res, columns=['SUM', 'MIN', 'MAX', 'MEAN'])
-    res['id'] = gdf[gdf_id].values
+        gdf["geometry"] = gdf["geometry"].buffer(0)
+    res = rMisc.zonalStats(
+        gdf, raster_file, minVal=minVal, maxVal=maxVal, verbose=verbose, reProj=True
+    )
+    res = pd.DataFrame(res, columns=["SUM", "MIN", "MAX", "MEAN"])
+    res["id"] = gdf[gdf_id].values
     if verbose:
-        tPrint(f'**** finished {cName}')
-    return({out_file:res})
+        tPrint(f"**** finished {raster_file}")
+    return {out_file: res}

-def zonal_stats_categories(gdf, gdf_id, raster_file, categories, out_file,
-                            buffer0=False, verbose=False):
-    ''' Run zonal stats on a categorical raster file using a list of h3 cells
-    '''
+
+def zonal_stats_categories(
+    gdf, gdf_id, raster_file, categories, out_file, buffer0=False, verbose=False
+):
+    """Run zonal stats on a categorical raster file using a list of h3 cells"""
     if verbose:
-        tPrint(f'Starting zonal stats on {raster_file}')
+        tPrint(f"Starting zonal stats on {raster_file}")
     if buffer0:
-        gdf['geometry'] = gdf['geometry'].buffer(0)
-    res = rMisc.zonalStats(gdf, raster_file, rastType="C", unqVals=categories, verbose=verbose, reProj=True)
-    res = pd.DataFrame(res, columns=[f'c_{x}' for x in categories])
-    res['id'] = gdf[gdf_id].values
+        gdf["geometry"] = gdf["geometry"].buffer(0)
+    res = rMisc.zonalStats(
+        gdf, raster_file, rastType="C", unqVals=categories, verbose=verbose, reProj=True
+    )
+    res = pd.DataFrame(res, columns=[f"c_{x}" for x in categories])
+    res["id"] = gdf[gdf_id].values
     if verbose:
-        tPrint(f'**** finished {cName}')
-    return({out_file:res})
-
-
-def zonal_stats_categorical(gdf, gdf_id, raster_file, category_raster_file, out_file, categories=None, reclass_dict=None,
-                            buffer0=False, minVal=None, maxVal=None, verbose=False):
-    ''' Run zonal stats on a continuous raster file using a matching categorical raster
-        file and a list of h3 cells. For each defined category in the categorical
-        raster file, calculate the sum, min, max, mean for that category.
-    '''
-
-    tPrint(f'Starting zonal stats on {out_file}')
+        tPrint(f"**** finished {raster_file}")
+    return {out_file: res}
+
+
+def zonal_stats_categorical(
+    gdf,
+    gdf_id,
+    raster_file,
+    category_raster_file,
+    out_file,
+    categories=None,
+    reclass_dict=None,
+    buffer0=False,
+    minVal=None,
+    maxVal=None,
+    verbose=False,
+):
+    """Run zonal stats on a continuous raster file using a matching categorical raster
+    file and a list of h3 cells. 
For each defined category in the categorical + raster file, calculate the sum, min, max, mean for that category. + """ + + tPrint(f"Starting zonal stats on {out_file}") if buffer0: - gdf['geometry'] = gdf['geometry'].buffer(0) - - #extract category raster to gdf extent + gdf["geometry"] = gdf["geometry"].buffer(0) + + # extract category raster to gdf extent cat_d, cat_profile = rMisc.clipRaster(category_raster_file, gdf) # reclasify if necessary if not reclass_dict is None: @@ -291,23 +368,42 @@ def zonal_stats_categorical(gdf, gdf_id, raster_file, category_raster_file, out_ categories.append(key) # extract raster to gdf extent rast_d, rast_profile = rMisc.clipRaster(raster_file, gdf) - + # standardize categorical raster to zonal raster final_zonal_res = [] with rMisc.create_rasterio_inmemory(rast_profile, rast_d) as rast_src: with rMisc.create_rasterio_inmemory(cat_profile, cat_d) as cat_src: - cat_d, cat_profile = rMisc.standardizeInputRasters(cat_src, rast_src, resampling_type='nearest') + cat_d, cat_profile = rMisc.standardizeInputRasters( + cat_src, rast_src, resampling_type="nearest" + ) # Loop through each category for cur_cat in categories: cur_cat_d = (cat_d == cur_cat) * 1 cur_rast_d = rast_d * cur_cat_d - with rMisc.create_rasterio_inmemory(rast_profile, cur_rast_d) as cur_rast_src: - res = rMisc.zonalStats(gdf, cur_rast_src, minVal=minVal, maxVal=maxVal, verbose=verbose, reProj=True) - res = pd.DataFrame(res, columns=[f'{cur_cat}_SUM', f'{cur_cat}_MIN', f'{cur_cat}_MAX', f'{cur_cat}_MEAN']) - res['id'] = gdf[gdf_id].values - res.set_index('id', inplace=True) + with rMisc.create_rasterio_inmemory( + rast_profile, cur_rast_d + ) as cur_rast_src: + res = rMisc.zonalStats( + gdf, + cur_rast_src, + minVal=minVal, + maxVal=maxVal, + verbose=verbose, + reProj=True, + ) + res = pd.DataFrame( + res, + columns=[ + f"{cur_cat}_SUM", + f"{cur_cat}_MIN", + f"{cur_cat}_MAX", + f"{cur_cat}_MEAN", + ], + ) + res["id"] = gdf[gdf_id].values + res.set_index("id", inplace=True) final_zonal_res.append(res) ret = pd.concat(final_zonal_res, axis=1) if verbose: - tPrint(f'**** finished') - return({out_file:ret}) \ No newline at end of file + tPrint(f"**** finished") + return {out_file: ret} diff --git a/src/h3_helper.py b/src/h3_helper.py index 7682ca2..abc5a6f 100755 --- a/src/h3_helper.py +++ b/src/h3_helper.py @@ -1,39 +1,53 @@ -import sys, os, importlib, json, pickle -import folium, shapely, rasterio, matplotlib +import importlib +import json +import os +import pickle +import sys +from urllib.request import urlopen import contextily as ctx -import matplotlib.pyplot as plt +import folium +import geopandas as gpd +import matplotlib import matplotlib.patches as mpatches +import matplotlib.pyplot as plt import pandas as pd -import geopandas as gpd - -from rasterio.crs import CRS -from mpl_toolkits.axes_grid1 import make_axes_locatable +import rasterio +import shapely +from GOSTrocks.misc import tPrint from h3 import h3 +from mpl_toolkits.axes_grid1 import make_axes_locatable +from rasterio.crs import CRS from shapely.geometry import Polygon, mapping from shapely.ops import unary_union -from urllib.request import urlopen from tqdm import tqdm -from GOSTrocks.misc import tPrint def generate_h3_gdf(in_gdf, h3_level=7): - ''' Generate a GeoDataFrame of h3 grid cells from an input geodataframe - - :param in_gdf: geodataframe from which to create h3 cells - :type in_gdf: geopandas.GeoDataFrame - ''' + """Generate a GeoDataFrame of h3 grid cells from an input geodataframe + + :param in_gdf: geodataframe from 
which to create h3 cells + :type in_gdf: geopandas.GeoDataFrame + """ try: del final_hexs except: pass try: - final_hexs = list(h3.polyfill(in_gdf.unary_union.__geo_interface__, h3_level, geo_json_conformant=True)) + final_hexs = list( + h3.polyfill( + in_gdf.unary_union.__geo_interface__, h3_level, geo_json_conformant=True + ) + ) except: - for cPoly in tqdm(in_gdf.unary_union, desc=f"Generating h3 grid level {h3_level}"): - all_hexs = list(h3.polyfill(cPoly.__geo_interface__, h3_level, geo_json_conformant=True)) - try: + for cPoly in tqdm( + in_gdf.unary_union, desc=f"Generating h3 grid level {h3_level}" + ): + all_hexs = list( + h3.polyfill(cPoly.__geo_interface__, h3_level, geo_json_conformant=True) + ) + try: final_hexs = final_hexs + all_hexs except: final_hexs = all_hexs @@ -41,14 +55,22 @@ def generate_h3_gdf(in_gdf, h3_level=7): final_hexs = list(set(final_hexs)) hex_poly = lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)) - all_polys = gpd.GeoSeries(list(map(hex_poly, final_hexs)), index=final_hexs, crs="EPSG:4326") - all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=['geometry']) - all_polys['shape_id'] = list(all_polys.index) - return(all_polys) + all_polys = gpd.GeoSeries( + list(map(hex_poly, final_hexs)), index=final_hexs, crs="EPSG:4326" + ) + all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=["geometry"]) + all_polys["shape_id"] = list(all_polys.index) + return all_polys -def generate_lvl0_lists(h3_lvl, return_gdf=False, buffer0=False, - read_pickle=True, pickle_file = "h0_dictionary_of_h{lvl}_geodata_frames.pickle"): - """ generate a dictionary with keys as lvl0 codes with all children at h3_lvl level as values + +def generate_lvl0_lists( + h3_lvl, + return_gdf=False, + buffer0=False, + read_pickle=True, + pickle_file="h0_dictionary_of_h{lvl}_geodata_frames.pickle", +): + """generate a dictionary with keys as lvl0 codes with all children at h3_lvl level as values Parameters ---------- @@ -59,7 +81,7 @@ def generate_lvl0_lists(h3_lvl, return_gdf=False, buffer0=False, buffer0 : bool, optional buffer the h3 lvl 0 cells by 0 to fix inherent topological errors, by default False read_pickle : bool, optional - Optionally choose the read resulting data from a [ickle file defined by pickle_file, by default True. If pickle + Optionally choose the read resulting data from a [ickle file defined by pickle_file, by default True. 
If pickle file is not present, function will continue to generate results as if flag was set to False pickle_file : str, optional Path of pickle file to read if read_pickle is set to True @@ -72,15 +94,24 @@ def generate_lvl0_lists(h3_lvl, return_gdf=False, buffer0=False, if read_pickle: try: pickle_file = pickle_file.format(lvl=h3_lvl) - pickle_path = os.path.join(os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))), pickle_file) - print(f"Loading pickle file {pickle_file}: it exists {os.path.exists(pickle_path)}") - with open(pickle_path, 'rb') as handle: + pickle_path = os.path.join( + os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))), + pickle_file, + ) + print( + f"Loading pickle file {pickle_file}: it exists {os.path.exists(pickle_path)}" + ) + with open(pickle_path, "rb") as handle: xx = pickle.load(handle) - return(xx) + return xx except: - #print("Could not load pickle file, continuing to process h0 manually") - raise(ValueError("Could not load pickle file, continuing to process h0 manually")) - + # print("Could not load pickle file, continuing to process h0 manually") + raise ( + ValueError( + "Could not load pickle file, continuing to process h0 manually" + ) + ) + # Get list of all h3 lvl 0 cells h3_lvl0 = list(h3.get_res0_indexes()) @@ -89,21 +120,34 @@ def generate_lvl0_lists(h3_lvl, return_gdf=False, buffer0=False, for h3_0 in h3_lvl0: h3_children = list(h3.h3_to_children(h3_0, h3_lvl)) if return_gdf: - hex_poly = lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)) - all_polys = gpd.GeoSeries(list(map(hex_poly, h3_children)), index=h3_children, crs=4326) - all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=['geometry']) + hex_poly = lambda hex_id: Polygon( + h3.h3_to_geo_boundary(hex_id, geo_json=True) + ) + all_polys = gpd.GeoSeries( + list(map(hex_poly, h3_children)), index=h3_children, crs=4326 + ) + all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=["geometry"]) if buffer0: - all_polys['geometry'] = all_polys['geometry'].apply(lambda x: x.buffer(0)) - all_polys['shape_id'] = list(all_polys.index) - + all_polys["geometry"] = all_polys["geometry"].apply( + lambda x: x.buffer(0) + ) + all_polys["shape_id"] = list(all_polys.index) + h3_lvl0_children[h3_0] = all_polys else: h3_lvl0_children[h3_0] = h3_children return h3_lvl0_children -def generate_lvl1_lists(h3_lvl, return_gdf=False, buffer0=False, - read_pickle=True, pickle_file = "h1_dictionary_of_h{lvl}_geodata_frames.pickle", write_pickle=False): - """ generate a dictionary with keys as lvl1 codes with all children at h3_lvl level as values + +def generate_lvl1_lists( + h3_lvl, + return_gdf=False, + buffer0=False, + read_pickle=True, + pickle_file="h1_dictionary_of_h{lvl}_geodata_frames.pickle", + write_pickle=False, +): + """generate a dictionary with keys as lvl1 codes with all children at h3_lvl level as values Parameters ---------- @@ -114,7 +158,7 @@ def generate_lvl1_lists(h3_lvl, return_gdf=False, buffer0=False, buffer0 : bool, optional buffer the h3 lvl 0 cells by 0 to fix inherent topological errors, by default False read_pickle : bool, optional - Optionally choose the read resulting data from a [ickle file defined by pickle_file, by default True. If pickle + Optionally choose the read resulting data from a [ickle file defined by pickle_file, by default True. 
If pickle file is not present, function will continue to generate results as if flag was set to False pickle_file : str, optional Path of pickle file to read if read_pickle is set to True @@ -125,113 +169,160 @@ def generate_lvl1_lists(h3_lvl, return_gdf=False, buffer0=False, dictionary with keys as lvl0 codes with all children at h3_lvl level as values; returns a GeoDataFrame if return_gdf is True """ pickle_file = pickle_file.format(lvl=h3_lvl) - pickle_path = os.path.join(os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))), pickle_file) + pickle_path = os.path.join( + os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))), + pickle_file, + ) if read_pickle: try: - print(f"Loading pickle file {pickle_file}: it exists {os.path.exists(pickle_path)}") - with open(pickle_path, 'rb') as handle: + print( + f"Loading pickle file {pickle_file}: it exists {os.path.exists(pickle_path)}" + ) + with open(pickle_path, "rb") as handle: xx = pickle.load(handle) - return(xx) + return xx except: print("Could not load pickle file, continuing to process h1 manually") - raise(ValueError("Could not load pickle file, exiting. Set read_pickle to False to generate list")) - + raise ( + ValueError( + "Could not load pickle file, exiting. Set read_pickle to False to generate list" + ) + ) + # Get list of all h3 lvl 0 cells h3_lvl0 = list(h3.get_res0_indexes()) # Generate list of all children of h3 lvl 1 cells h3_lvl1_children = {} - for h3_0 in h3_lvl0: # Identify all lvl 0 cells + for h3_0 in h3_lvl0: # Identify all lvl 0 cells h3_children = list(h3.h3_to_children(h3_0, 1)) - for h3_1 in h3_children: # For current lvl 0 cell, loop through all level 1 children + for ( + h3_1 + ) in h3_children: # For current lvl 0 cell, loop through all level 1 children h3_children_1 = list(h3.h3_to_children(h3_1, h3_lvl)) if return_gdf: - hex_poly = lambda hex_id: Polygon(h3.h3_to_geo_boundary(hex_id, geo_json=True)) - all_polys = gpd.GeoSeries(list(map(hex_poly, h3_children_1)), index=h3_children_1, crs=4326) - all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=['geometry']) + hex_poly = lambda hex_id: Polygon( + h3.h3_to_geo_boundary(hex_id, geo_json=True) + ) + all_polys = gpd.GeoSeries( + list(map(hex_poly, h3_children_1)), index=h3_children_1, crs=4326 + ) + all_polys = gpd.GeoDataFrame(all_polys, crs=4326, columns=["geometry"]) if buffer0: - all_polys['geometry'] = all_polys['geometry'].apply(lambda x: x.buffer(0)) - all_polys['shape_id'] = list(all_polys.index) - + all_polys["geometry"] = all_polys["geometry"].apply( + lambda x: x.buffer(0) + ) + all_polys["shape_id"] = list(all_polys.index) + h3_lvl1_children[h3_1] = all_polys else: h3_lvl1_children[h3_1] = h3_children_1 - + if write_pickle: if not os.path.exists(pickle_path): - with open(pickle_path, 'wb') as handle: + with open(pickle_path, "wb") as handle: pickle.dump(h3_lvl1_children, handle, protocol=pickle.HIGHEST_PROTOCOL) - + return h3_lvl1_children -def map_choropleth(sub, map_column, thresh=[], colour_ramp = 'Reds', invert=False, map_epsg=3857, legend_loc='upper right'): - ''' generate a static map of variables in GeoDataFrame sub - - :param sub: GeoDataFrame with geometry and column to map - :type sub: GeoPandas.GeoDataFrame - :param map_column: Name of column in sub to map - :type map_column: string - :param thresh: list of values to classify data in map_column - :type thresh: list of ints - ''' - try: - sub = sub.to_crs(map_epsg) - except: - sub.crs = 4326 - sub = sub.to_crs(map_epsg) - - thresh=[] - map_sub = 
sub.copy() - cmap = matplotlib.cm.get_cmap(colour_ramp) - fig, ax = plt.subplots(figsize=(15,15)) - proj = CRS.from_epsg(map_epsg) - - # create map column in sub, based on re-mapping of column map_column - if len(thresh) == 0: - split = [0,0.2,0.4,0.6,0.8,1] - thresh = [x for x in map_sub[map_column].quantile(split).values] - thresh.insert(0,0) - - map_sub['map'] = pd.cut(map_sub[map_column], thresh, labels=list(range(0, len(thresh)-1))) - - # [x/max(thresh) for x in thresh] - cmap_divisions = [x/100 for x in list(range(0,101,20))] - # map features not included in grouping - sel_mixed = map_sub.loc[map_sub['map'].isna()] - mismatch_color = 'azure' - mismatch_edge = 'darkblue' - cur_patch = mpatches.Patch(facecolor=mismatch_color, edgecolor=mismatch_edge, hatch="///", label=f"Mismatch [{sel_mixed.shape[0]}]") - all_labels = [cur_patch] - for lbl, data in map_sub.groupby('map'): - cur_color = cmap(cmap_divisions[int(lbl)]) - if invert: - cur_color = cmap(1 - cmap_divisions[int(lbl)]) - data.plot(color=cur_color, ax=ax, linewidth=0.1) - cur_patch = mpatches.Patch(color=cur_color, label=f'{data[map_column].min()} - {data[map_column].max()} [{data.shape[0]}]') - all_labels.append(cur_patch) - - sel_mixed.plot(color=mismatch_color, edgecolor=mismatch_edge, hatch="//////", ax=ax, label=False, linewidth=2) - - ctx.add_basemap(ax, source=ctx.providers.Stamen.TonerBackground, crs=proj) #zorder=-10, 'EPSG:4326' - ax.legend(handles=all_labels, loc=legend_loc) - ax = ax.set_axis_off() - - return(ax) - -def static_map_h3(sub, map_epsg=3857, legend_loc='upper right'): - ''' generate a static map of the h3 grid in sub - ''' - try: - sub = sub.to_crs(map_epsg) - except: - sub.crs = 4326 - sub = sub.to_crs(map_epsg) - - fig, ax = plt.subplots(figsize=(15,15)) - proj = CRS.from_epsg(map_epsg) - - sub.plot(color='grey', ax=ax, linewidth=0.1) - - ctx.add_basemap(ax, source=ctx.providers.Stamen.TonerBackground, crs=proj) #zorder=-10, 'EPSG:4326' - ax = ax.set_axis_off() - return(ax) \ No newline at end of file + +def map_choropleth( + sub, + map_column, + thresh=[], + colour_ramp="Reds", + invert=False, + map_epsg=3857, + legend_loc="upper right", +): + """generate a static map of variables in GeoDataFrame sub + + :param sub: GeoDataFrame with geometry and column to map + :type sub: GeoPandas.GeoDataFrame + :param map_column: Name of column in sub to map + :type map_column: string + :param thresh: list of values to classify data in map_column + :type thresh: list of ints + """ + try: + sub = sub.to_crs(map_epsg) + except: + sub.crs = 4326 + sub = sub.to_crs(map_epsg) + + thresh = [] + map_sub = sub.copy() + cmap = matplotlib.cm.get_cmap(colour_ramp) + fig, ax = plt.subplots(figsize=(15, 15)) + proj = CRS.from_epsg(map_epsg) + + # create map column in sub, based on re-mapping of column map_column + if len(thresh) == 0: + split = [0, 0.2, 0.4, 0.6, 0.8, 1] + thresh = [x for x in map_sub[map_column].quantile(split).values] + thresh.insert(0, 0) + + map_sub["map"] = pd.cut( + map_sub[map_column], thresh, labels=list(range(0, len(thresh) - 1)) + ) + + # [x/max(thresh) for x in thresh] + cmap_divisions = [x / 100 for x in list(range(0, 101, 20))] + # map features not included in grouping + sel_mixed = map_sub.loc[map_sub["map"].isna()] + mismatch_color = "azure" + mismatch_edge = "darkblue" + cur_patch = mpatches.Patch( + facecolor=mismatch_color, + edgecolor=mismatch_edge, + hatch="///", + label=f"Mismatch [{sel_mixed.shape[0]}]", + ) + all_labels = [cur_patch] + for lbl, data in map_sub.groupby("map"): + cur_color 
= cmap(cmap_divisions[int(lbl)]) + if invert: + cur_color = cmap(1 - cmap_divisions[int(lbl)]) + data.plot(color=cur_color, ax=ax, linewidth=0.1) + cur_patch = mpatches.Patch( + color=cur_color, + label=f"{data[map_column].min()} - {data[map_column].max()} [{data.shape[0]}]", + ) + all_labels.append(cur_patch) + + sel_mixed.plot( + color=mismatch_color, + edgecolor=mismatch_edge, + hatch="//////", + ax=ax, + label=False, + linewidth=2, + ) + + ctx.add_basemap( + ax, source=ctx.providers.Stamen.TonerBackground, crs=proj + ) # zorder=-10, 'EPSG:4326' + ax.legend(handles=all_labels, loc=legend_loc) + ax = ax.set_axis_off() + + return ax + + +def static_map_h3(sub, map_epsg=3857, legend_loc="upper right"): + """generate a static map of the h3 grid in sub""" + try: + sub = sub.to_crs(map_epsg) + except: + sub.crs = 4326 + sub = sub.to_crs(map_epsg) + + fig, ax = plt.subplots(figsize=(15, 15)) + proj = CRS.from_epsg(map_epsg) + + sub.plot(color="grey", ax=ax, linewidth=0.1) + + ctx.add_basemap( + ax, source=ctx.providers.Stamen.TonerBackground, crs=proj + ) # zorder=-10, 'EPSG:4326' + ax = ax.set_axis_off() + return ax diff --git a/src/space2stats_data_config.py b/src/space2stats_data_config.py index 1797cfb..daee6a6 100755 --- a/src/space2stats_data_config.py +++ b/src/space2stats_data_config.py @@ -1,21 +1,24 @@ -import sys, os, json +import json +import os +import sys + import geojson class s2s_geo_data: def __init__(self, json_path): - """ Extract metatdata and processing information for input geospatial layers + """Extract metatdata and processing information for input geospatial layers Args: json_path (string): path to json file to process """ - with open(json_path, 'r') as in_data: + with open(json_path, "r") as in_data: in_json = json.load(in_data) self.data_info = in_json - def get_path(self, yyyy='', mm='', dd=''): - """ Get path to geospatial data for processing + def get_path(self, yyyy="", mm="", dd=""): + """Get path to geospatial data for processing Args: yyyy (str, optional): specific year to process. Defaults to ''. @@ -24,5 +27,6 @@ def get_path(self, yyyy='', mm='', dd=''): """ inD = self.data_info.copy() - s3_path = os.path.join(inD['s3_bucket_base'], ) - + s3_path = os.path.join( + inD["s3_bucket_base"], + )
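The helpers touched above are intended to be chained: h3_helper.generate_h3_gdf builds the hexagon grid for a boundary layer, global_zonal.zonal_stats_numerical summarizes a raster over that grid, and global_zonal.connect_polygons_h3_stats folds the per-cell statistics back onto the input polygons. A minimal usage sketch follows; the file paths and the ADM2_CODE id column are illustrative placeholders, not inputs defined by this change.

import geopandas as gpd

import global_zonal
import h3_helper

# 1. Build a level-6 H3 grid covering the admin boundaries (placeholder path)
admin = gpd.read_file("admin_boundaries.geojson")
h3_grid = h3_helper.generate_h3_gdf(admin, h3_level=6)

# 2. Summarize a population raster over the H3 cells (placeholder raster);
#    the out_file string is only used as the key of the returned dictionary
res = global_zonal.zonal_stats_numerical(
    h3_grid, "shape_id", "population.tif", "h3_population_stats.csv", minVal=0
)
stats_df = res["h3_population_stats.csv"]

# 3. Re-aggregate the cell-level statistics onto the admin polygons
admin_stats = global_zonal.connect_polygons_h3_stats(
    admin, stats_df, h3_level=6, id_col="ADM2_CODE"
)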