From 8303910b07ce5a83d7c2e346f4b3a7e50fc69fe5 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 20:02:51 -0400 Subject: [PATCH 1/7] Factor out dynamic chunking func --- feedstock/recipe.py | 79 --------------------------------------------- 1 file changed, 79 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 7ab6546b..c8eab744 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -129,11 +129,9 @@ iids_filtered = list(set(iids) - iids_to_skip) logger.info(f"Pruned {len(iids) - len(iids_filtered)}/{len(iids)} iids from input list") - if prune_iids: iids_filtered = iids_filtered[0:20] - # Now that we have the iids that are not yet ingested, we can prune the full iid_info_dict and extract the 'id' field iid_info_dict_filtered = {k: v for k, v in iid_info_dict.items() if k in iids_filtered} dataset_ids_filtered = [v["id"] for v in iid_info_dict_filtered.values()] @@ -173,83 +171,6 @@ def combine_dicts(dicts): # Print the actual urls logger.debug(f"{recipe_dict = }") - -## Dynamic Chunking Wrapper -def dynamic_chunking_func(ds: xr.Dataset) -> Dict[str, int]: - import warnings - - # trying to import inside the function - from dynamic_chunks.algorithms import ( - even_divisor_algo, - iterative_ratio_increase_algo, - NoMatchingChunks, - ) - - logger.info(f"Input Dataset for dynamic chunking {ds =}") - - target_chunk_size = "150MB" - target_chunks_aspect_ratio = { - "time": 10, - "x": 1, - "i": 1, - "ni": 1, - "xh": 1, - "nlon": 1, - "lon": 1, # TODO: Maybe import all the known spatial dimensions from xmip? - "y": 1, - "j": 1, - "nj": 1, - "yh": 1, - "nlat": 1, - "lat": 1, - } - size_tolerance = 0.5 - - # Some datasets are smaller than the target chunk size and should not be chunked at all - if ds.nbytes < parse_bytes(target_chunk_size): - target_chunks = dict(ds.dims) - - else: - try: - target_chunks = even_divisor_algo( - ds, - target_chunk_size, - target_chunks_aspect_ratio, - size_tolerance, - allow_extra_dims=True, - ) - - except NoMatchingChunks: - warnings.warn( - "Primary algorithm using even divisors along each dimension failed " - "with. Trying secondary algorithm." - f"Input {ds=}" - ) - try: - target_chunks = iterative_ratio_increase_algo( - ds, - target_chunk_size, - target_chunks_aspect_ratio, - size_tolerance, - allow_extra_dims=True, - ) - except NoMatchingChunks: - raise ValueError( - ( - "Could not find any chunk combinations satisfying " - "the size constraint with either algorithm." - f"Input {ds=}" - ) - ) - # If something fails - except Exception as e: - raise e - except Exception as e: - raise e - logger.info(f"Dynamic Chunking determined {target_chunks =}") - return target_chunks - - ## Create the recipes recipes = {} From aa06723db1536d32664338db65c4ba74ad08ba73 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 00:03:23 +0000 Subject: [PATCH 2/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index c8eab744..67940d39 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -2,8 +2,6 @@ """Modified transforms from Pangeo Forge""" import apache_beam as beam -from typing import Dict -from dask.utils import parse_bytes from pangeo_forge_esgf import setup_logging from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs @@ -19,7 +17,6 @@ ) import logging import os -import xarray as xr import yaml from tqdm.auto import tqdm From a98d3c938b8c7c03dd8ddb4f4be1d6b0a5b14b03 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 20:04:43 -0400 Subject: [PATCH 3/7] Update recipe.py --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 67940d39..abab26a2 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -5,7 +5,7 @@ from pangeo_forge_esgf import setup_logging from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs -from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor +from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor, dynamic_chunkinc_func from pangeo_forge_esgf.client import ESGFClient from pangeo_forge_recipes.patterns import pattern_from_file_sequence from pangeo_forge_recipes.transforms import ( From 8d5abc9d33464f9bdf443258ee6dec63b1dc1dd1 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 00:04:49 +0000 Subject: [PATCH 4/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index abab26a2..67940d39 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -5,7 +5,7 @@ from pangeo_forge_esgf import setup_logging from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs -from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor, dynamic_chunkinc_func +from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor from pangeo_forge_esgf.client import ESGFClient from pangeo_forge_recipes.patterns import pattern_from_file_sequence from pangeo_forge_recipes.transforms import ( From 314469d44deb57801b186303983f20c4c5e159b5 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 20:28:04 -0400 Subject: [PATCH 5/7] Update requirements.txt --- feedstock/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/feedstock/requirements.txt b/feedstock/requirements.txt index 5efdfa90..5d052a0c 100644 --- a/feedstock/requirements.txt +++ b/feedstock/requirements.txt @@ -1,4 +1,5 @@ -leap-data-management-utils==0.0.7 +#leap-data-management-utils==0.0.7 +git+https://github.com/leap-stc/leap-data-management-utils.git@move-dyn-chunking-func #pangeo-forge-esgf==0.2.0 git+https://github.com/jbusecke/pangeo-forge-esgf.git@new-request-scheme dynamic-chunks==0.0.3 From d3841a4d7c4e3f6ace07a4ac166b81162728dd37 Mon Sep 17 00:00:00 2001 From: Julius Busecke Date: Tue, 7 May 2024 20:29:42 -0400 Subject: [PATCH 6/7] Update recipe.py --- feedstock/recipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 67940d39..2376fe6a 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -5,7 +5,7 @@ from pangeo_forge_esgf import setup_logging from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs -from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor +from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor, dynamic_chunking_func from pangeo_forge_esgf.client import ESGFClient from pangeo_forge_recipes.patterns import pattern_from_file_sequence from pangeo_forge_recipes.transforms import ( From ecba5747a7aa63e402d12177942af89b6a7537b6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 8 May 2024 00:29:47 +0000 Subject: [PATCH 7/7] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- feedstock/recipe.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/feedstock/recipe.py b/feedstock/recipe.py index 2376fe6a..dd481deb 100644 --- a/feedstock/recipe.py +++ b/feedstock/recipe.py @@ -5,7 +5,11 @@ from pangeo_forge_esgf import setup_logging from leap_data_management_utils import CMIPBQInterface, LogCMIPToBigQuery from leap_data_management_utils.data_management_transforms import Copy, InjectAttrs -from leap_data_management_utils.cmip_transforms import TestDataset, Preprocessor, dynamic_chunking_func +from leap_data_management_utils.cmip_transforms import ( + TestDataset, + Preprocessor, + dynamic_chunking_func, +) from pangeo_forge_esgf.client import ESGFClient from pangeo_forge_recipes.patterns import pattern_from_file_sequence from pangeo_forge_recipes.transforms import (