From 981d676ef235313630cadd6985e26de668b4caf1 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 28 Oct 2024 14:34:04 +0100 Subject: [PATCH 01/68] First attempt at adding derived forcings --- mllam_data_prep/config.py | 1 + mllam_data_prep/create_dataset.py | 66 ++++++++++++++++++++- mllam_data_prep/ops/derive_variables.py | 79 +++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 1 deletion(-) create mode 100644 mllam_data_prep/ops/derive_variables.py diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 6112a0c..2cd439f 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -171,6 +171,7 @@ class InputDataset: dim_mapping: Dict[str, DimMapping] target_output_variable: str attributes: Dict[str, Any] = None + derive_variables: bool = False @dataclass diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index ad14704..69249d5 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,6 +10,10 @@ from . 
import __version__ from .config import Config, InvalidConfigException +from .ops.derive_variables import ( + derive_toa_radiation, + get_variables_for_deriving_toa_radiation, +) from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs @@ -116,9 +120,20 @@ def create_dataset(config: Config): output_dims = output_config.variables[target_output_var] + # Check if the variables should be derived/calculated + derive_input_variables = input_config.derive_variables or False + + if derive_input_variables: + logger.info( + f"Get variables needed to derive additional/external forcings: {variables}" + ) + variables_to_extract = get_variables_for_forcing_derivation(variables) + else: + variables_to_extract = variables + logger.info(f"Loading dataset {dataset_name} from {path}") try: - ds = load_and_subset_dataset(fp=path, variables=variables) + ds = load_and_subset_dataset(fp=path, variables=variables_to_extract) except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex _check_dataset_attributes( @@ -127,6 +142,9 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) + if derive_input_variables: + ds = derive_forcings(ds, variables, variables_to_extract) + dim_mapping = input_config.dim_mapping # check that there is an entry for each arch dimension @@ -266,3 +284,49 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None): logger.info(f"Wrote training-ready dataset to {fp_zarr}") logger.info(ds) + + +def get_variables_for_forcing_derivation(variables): + """ + Extract the variables needed for deriving the external/additional forcings + """ + if isinstance(variables, dict): + raise Exception("Not implemented yet") + elif isinstance(variables, list): + variables_to_extract = set() + for var in variables: + if var == "toa_radiation": + vars = get_variables_for_deriving_toa_radiation() + else: + raise Exception(f"Function for deriving {var} 
is not implemented yet!") + + # Add variable names to set (only adds unique variables) + variables_to_extract.update(vars) + + # Turn the set into a list + variables_to_extract = list(variables_to_extract) + + return variables_to_extract + + +def derive_forcings(ds, variables, variables_to_extract): + """ + Derive the specified forcings + """ + if isinstance(variables, dict): + raise Exception("Not implemented yet") + elif isinstance(variables, list): + for var in variables: + if var == "toa_radiation": + ds = derive_toa_radiation(ds) + else: + raise Exception(f"Function for deriving {var} is not implemented yet!") + + # Drop all the unneeded variables that have only been used to derive the + # forcing variables. Need to keep any variables that are also coordinates! + variables_to_drop = [ + var for var in variables_to_extract if var not in list(ds._coord_names) + ] + ds = ds.drop_vars(variables_to_drop, errors="ignore") + + return ds diff --git a/mllam_data_prep/ops/derive_variables.py b/mllam_data_prep/ops/derive_variables.py new file mode 100644 index 0000000..9ad3be9 --- /dev/null +++ b/mllam_data_prep/ops/derive_variables.py @@ -0,0 +1,79 @@ +import dask.array as da +import numpy as np +import xarray as xr + + +def derive_toa_radiation(ds): + """ + Derive approximate TOA radiation (instantaneous values [W*m**-2]) + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive TOA radiation + + Returns + ------- + ds: xr.Dataset + The dataset with TOA radiation added + """ + + # Need to construct a new dataset with chunks since + # lat and lon are coordinates and are therefore eagerly loaded + ds_dict = {} + ds_dict["lat"] = (list(ds.lat.dims), da.from_array(ds.lat.values, chunks=(-1, -1))) + ds_dict["lon"] = (list(ds.lon.dims), da.from_array(ds.lon.values, chunks=(-1, -1))) + ds_dict["t"] = (list(ds.time.dims), da.from_array(ds.time.values, chunks=(10))) + ds_chunks = xr.Dataset(ds_dict) + + # Calculate TOA radiation + toa_radiation 
= calc_toa_radiation(ds_chunks) + + # Assign to the dataset + ds = ds.assign(toa_radiation=toa_radiation) + + return ds + + +def calc_toa_radiation(ds): + """ + Function for calculation top-of-the-atmosphere radiation + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive TOA radiation + + Returns + ------- + toa_radiation: xr.DataArray + TOA radiation data-array + """ + # Solar constant + E0 = 1366 # W*m**-2 + + day = ds.t.dt.dayofyear + hr_utc = ds.t.dt.hour + + # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. + dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) + + hr_lst = hr_utc + ds.lon / 15 + hr_angle = 15 * (hr_lst - 12) + + # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. + cos_sza = np.sin(ds.lat * np.pi / 180) * np.sin(dec) + np.cos( + ds.lat * np.pi / 180 + ) * np.cos(dec) * np.cos(hr_angle * np.pi / 180) + + # Where TOA radiation is negative, set to 0 + toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) + + return toa_radiation + + +def get_variables_for_deriving_toa_radiation(): + """ + Get list of variables needed for the TOA radiation calculation + """ + return ["lat", "lon", "time"] From 79a94db939ff089d221a190b836dd1a03160c452 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 08:09:29 +0000 Subject: [PATCH 02/68] Re-structure approach - Update the configuration file so that we list the dependencies and the method used to calculate the derived variable instead of having a flag to say that the variables should be derived. This approach is temporary and might be revised soon. - Add a new class in mllam_data_prep/config.py for derived variables to distinguish them from non-derived variables. - Updates to mllam_data_prep/ops/loading.py to distinguish between derived and non-derived variables. - Move all functions related to forcing derivations to a new and renamed module (mllam_data_prep/ops/forcings.py). 
--- example.danra.yaml | 22 ++++++ mllam_data_prep/config.py | 22 +++++- mllam_data_prep/create_dataset.py | 68 ++----------------- .../ops/{derive_variables.py => forcings.py} | 55 +++++++++++++-- mllam_data_prep/ops/loading.py | 21 +++++- 5 files changed, 113 insertions(+), 75 deletions(-) rename mllam_data_prep/ops/{derive_variables.py => forcings.py} (57%) diff --git a/example.danra.yaml b/example.danra.yaml index 28ae1af..0e6e7be 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -73,6 +73,28 @@ inputs: name_format: f"{var_name}" target_output_variable: forcing + danra_additional_forcings: + path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr + dims: [time, x, y] + variables: + - toa_radiation: + dependencies: + - time + - lat + - lon + method: derive_toa_radiation + dim_mapping: + time: + method: rename + dim: time + grid_index: + method: stack + dims: [x, y] + forcing_feature: + method: stack_variables_by_var_name + name_format: f"{var_name}" + target_output_variable: forcing + danra_lsm: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/lsm.zarr dims: [x, y] diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 2cd439f..354aaf9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -64,6 +64,22 @@ class ValueSelection: units: str = None +@dataclass +class DerivedVariable: + """ + Defines a derived variables, where the dependencies (variables used + in the calculation) and the method (function for calculations) are + specified. + + Attributes: + dependencies: The variables to use in the calculation. + method: The methpd with which to derive the variable. 
+ """ + + dependencies: List[str] + method: str = None + + @dataclass class DimMapping: """ @@ -167,11 +183,13 @@ class InputDataset: path: str dims: List[str] - variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] + variables: Union[ + List[Union[str, Dict[str, DerivedVariable]]], + Dict[str, Dict[str, ValueSelection]], + ] dim_mapping: Dict[str, DimMapping] target_output_variable: str attributes: Dict[str, Any] = None - derive_variables: bool = False @dataclass diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 69249d5..a6813c3 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,10 +10,7 @@ from . import __version__ from .config import Config, InvalidConfigException -from .ops.derive_variables import ( - derive_toa_radiation, - get_variables_for_deriving_toa_radiation, -) +from .ops.forcings import derive_forcings from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs @@ -120,20 +117,9 @@ def create_dataset(config: Config): output_dims = output_config.variables[target_output_var] - # Check if the variables should be derived/calculated - derive_input_variables = input_config.derive_variables or False - - if derive_input_variables: - logger.info( - f"Get variables needed to derive additional/external forcings: {variables}" - ) - variables_to_extract = get_variables_for_forcing_derivation(variables) - else: - variables_to_extract = variables - logger.info(f"Loading dataset {dataset_name} from {path}") try: - ds = load_and_subset_dataset(fp=path, variables=variables_to_extract) + ds = load_and_subset_dataset(fp=path, variables=variables) except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex _check_dataset_attributes( @@ -142,8 +128,8 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) - if derive_input_variables: - ds = 
derive_forcings(ds, variables, variables_to_extract) + # Derive forcing variables (if applicable) + ds = derive_forcings(ds, variables) dim_mapping = input_config.dim_mapping @@ -284,49 +270,3 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None): logger.info(f"Wrote training-ready dataset to {fp_zarr}") logger.info(ds) - - -def get_variables_for_forcing_derivation(variables): - """ - Extract the variables needed for deriving the external/additional forcings - """ - if isinstance(variables, dict): - raise Exception("Not implemented yet") - elif isinstance(variables, list): - variables_to_extract = set() - for var in variables: - if var == "toa_radiation": - vars = get_variables_for_deriving_toa_radiation() - else: - raise Exception(f"Function for deriving {var} is not implemented yet!") - - # Add variable names to set (only adds unique variables) - variables_to_extract.update(vars) - - # Turn the set into a list - variables_to_extract = list(variables_to_extract) - - return variables_to_extract - - -def derive_forcings(ds, variables, variables_to_extract): - """ - Derive the specified forcings - """ - if isinstance(variables, dict): - raise Exception("Not implemented yet") - elif isinstance(variables, list): - for var in variables: - if var == "toa_radiation": - ds = derive_toa_radiation(ds) - else: - raise Exception(f"Function for deriving {var} is not implemented yet!") - - # Drop all the unneeded variables that have only been used to derive the - # forcing variables. Need to keep any variables that are also coordinates! 
- variables_to_drop = [ - var for var in variables_to_extract if var not in list(ds._coord_names) - ] - ds = ds.drop_vars(variables_to_drop, errors="ignore") - - return ds diff --git a/mllam_data_prep/ops/derive_variables.py b/mllam_data_prep/ops/forcings.py similarity index 57% rename from mllam_data_prep/ops/derive_variables.py rename to mllam_data_prep/ops/forcings.py index 9ad3be9..207ded5 100644 --- a/mllam_data_prep/ops/derive_variables.py +++ b/mllam_data_prep/ops/forcings.py @@ -1,6 +1,53 @@ import dask.array as da import numpy as np import xarray as xr +from loguru import logger + + +def derive_forcings(ds, variables): + """ + Derive the specified forcings + + Parameters + --------- + ds : xr.Dataset + The loaded and subsetted dataset + variables: list/dict + List or dictionary with variables + + Returns + ------- + ds : xr.Dataset + Dataset with derived variables included + """ + variables_to_derive = { + k: v for elem in variables if isinstance(elem, dict) for (k, v) in elem.items() + } + + if variables_to_derive == {}: + pass + else: + logger.info("Deriving additional forcings") + for _, derived_var in variables_to_derive.items(): + # Get the function defined in the config file + func = globals()[derived_var.method] + # Currently, we're passing the whole dataset + ds = func(ds) + + # Drop all the unneeded variables that have only been used to derive the + # forcing variables. HOWEVER, it's necessary to keep variables that are + # also coordinates! 
+ derived_variable_dependencies = [] + for _, derived_var in variables_to_derive.items(): + derived_variable_dependencies += derived_var.dependencies + variables_to_drop = [ + var + for var in derived_variable_dependencies + if var not in list(ds._coord_names) + ] + ds = ds.drop_vars(variables_to_drop) + + return ds def derive_toa_radiation(ds): @@ -17,6 +64,7 @@ def derive_toa_radiation(ds): ds: xr.Dataset The dataset with TOA radiation added """ + logger.info("Calculating top-of-atmosphere radiation") # Need to construct a new dataset with chunks since # lat and lon are coordinates and are therefore eagerly loaded @@ -70,10 +118,3 @@ def calc_toa_radiation(ds): toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) return toa_radiation - - -def get_variables_for_deriving_toa_radiation(): - """ - Get list of variables needed for the TOA radiation calculation - """ - return ["lat", "lon", "time"] diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index 955fafd..43b1372 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -45,13 +45,30 @@ def load_and_subset_dataset(fp, variables): ) ds_subset[var] = da elif isinstance(variables, list): + # Check if the variables in a section are all derived variables or not + if all(isinstance(var, dict) for var in variables): + variables_to_extract = set() + for var in variables: + for _, var_dict in var.items(): + variables_to_extract.update(var_dict.dependencies) + elif all(isinstance(var, str) for var in variables): + variables_to_extract = variables + else: + raise TypeError( + "Expected either a list of strings or a list of dicts " + "but got a list of mixed types. If you are trying to derive " + "variables they should go in its own input section." + ) + + # Subset the dataset try: - ds_subset = ds[variables] + ds_subset = ds[variables_to_extract] except KeyError as ex: raise KeyError( - f"Could not find the all variables `{variables}` in the dataset. 
" + f"Could not find the all variables `{variables_to_extract}` in the dataset. " f"The available variables are {list(ds.data_vars)}" ) from ex else: raise ValueError("The `variables` argument should be a list or a dictionary") + return ds_subset From f37161c827f1c85d13cc4144adca616b3622b61a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 08:20:10 +0000 Subject: [PATCH 03/68] Add derivation of cyclic encoded hour of day and day of year --- example.danra.yaml | 8 ++++ mllam_data_prep/ops/forcings.py | 78 +++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/example.danra.yaml b/example.danra.yaml index 0e6e7be..4f1c29a 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -83,6 +83,14 @@ inputs: - lat - lon method: derive_toa_radiation + - hour_of_day: + dependencies: + - time + method: derive_hour_of_day + - day_of_year: + dependencies: + - time + method: derive_day_of_year dim_mapping: time: method: rename diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 207ded5..b945e7e 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -118,3 +118,81 @@ def calc_toa_radiation(ds): toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) return toa_radiation + + +def derive_hour_of_day(ds): + """ + Derive hour of day features with a cyclic encoding + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive hour of day + + Returns + ------- + ds: xr.Dataset + The dataset with hour of day added + """ + logger.info("Calculating hour of day") + + # Get the hour of the day + hour_of_day = ds.time.dt.hour + + # Cyclic encoding of hour of day + hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) + + # Assign to the dataset + ds = ds.assign(hour_of_day_sin=hour_of_day_sin) + ds = ds.assign(hour_of_day_cos=hour_of_day_cos) + + return ds + + +def derive_day_of_year(ds): + """ + Derive day of year features with a cyclic 
encoding + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive day of year + + Returns + ------- + ds: xr.Dataset + The dataset with day of year added + """ + logger.info("Calculating day of year") + + # Get the day of year + day_of_year = ds.time.dt.dayofyear + + # Cyclic encoding of day of year - use 366 to include leap years! + day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) + + # Assign to the dataset + ds = ds.assign(day_of_year_sin=day_of_year_sin) + ds = ds.assign(day_of_year_cos=day_of_year_cos) + + return ds + + +def cyclic_encoding(da, da_max): + """Cyclic encoding of data + + Parameters + ---------- + data : xr.DataArray + xarray data-array of the variable which should be cyclically encoded + data_max: int/float + maximum value of the data variable + + Returns + ------- + """ + + da_sin = np.sin((da / da_max) * 2 * np.pi) + da_cos = np.cos((da / da_max) * 2 * np.pi) + + return da_cos, da_sin From 71afd3af125dc2c913471250a7b618c2ad20af58 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 11:55:01 +0000 Subject: [PATCH 04/68] Add derivation of cyclic encoded time of year --- mllam_data_prep/ops/forcings.py | 39 +++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index b945e7e..470109f 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -178,6 +178,45 @@ def derive_day_of_year(ds): return ds +def derive_time_of_year(ds): + logger.info("Calculating time of year") + + # Get the number of seconds a datetime corresponds to + number_of_seconds = ( + (ds.time.dt.dayofyear - 1) * 60 * 60 * 24 + + ds.time.dt.hour * 60 * 60 + + ds.time.dt.minute * 60 + + ds.time.dt.second + ) + + # Cyclic encoding using both leap year and non-leap year + # number of seconds to be able to choose later where to + # include which values using xr.where() + time_of_year_cos_non_leap, 
time_of_year_sin_non_leap = cyclic_encoding( + number_of_seconds, 31622400 + ) + time_of_year_cos_leap, time_of_year_sin_leap = cyclic_encoding( + number_of_seconds, 31536000 + ) + + time_of_year_cos = xr.where( + ds.time.dt.is_leap_year, + time_of_year_cos_leap, + time_of_year_cos_non_leap, + ) + time_of_year_sin = xr.where( + ds.time.dt.is_leap_year, + time_of_year_sin_leap, + time_of_year_sin_non_leap, + ) + + # Assign to the dataset + ds = ds.assign(time_of_year_sin=time_of_year_sin) + ds = ds.assign(time_of_year_cos=time_of_year_cos) + + return ds + + def cyclic_encoding(da, da_max): """Cyclic encoding of data From abb626b92cee9a5b7ee7d85e7ae30ceff1d4d79b Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 6 Nov 2024 12:02:34 +0000 Subject: [PATCH 05/68] Update and add docstrings --- mllam_data_prep/ops/forcings.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 470109f..12e6396 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -179,6 +179,20 @@ def derive_day_of_year(ds): def derive_time_of_year(ds): + """ + Derive time of year features with a cyclic encoding + + Parameters + ---------- + ds : xr.Dataset + The dataset with variables needed to derive time of year + + Returns + ------- + ds: xr.Dataset + The dataset with time of year added + """ + logger.info("Calculating time of year") # Get the number of seconds a datetime corresponds to @@ -218,17 +232,22 @@ def derive_time_of_year(ds): def cyclic_encoding(da, da_max): - """Cyclic encoding of data + """ + Cyclic encoding of data Parameters ---------- - data : xr.DataArray - xarray data-array of the variable which should be cyclically encoded - data_max: int/float - maximum value of the data variable + da : xr.DataArray + xarray data-array that should be cyclically encoded + da_max: int/float + Maximum possible value of input data-array Returns ------- + 
da_cos: xr.DataArray + Cosine part of cyclically encoded input data-array + da_sin: xr.DataArray + Sine part of cyclically encoded input data-array """ da_sin = np.sin((da / da_max) * 2 * np.pi) From 8b1f18e19566a02499d83c8f8b51ea2163295621 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 09:30:57 +0000 Subject: [PATCH 06/68] Remove time_of_year --- mllam_data_prep/ops/forcings.py | 53 --------------------------------- 1 file changed, 53 deletions(-) diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 12e6396..5d5aa69 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -178,59 +178,6 @@ def derive_day_of_year(ds): return ds -def derive_time_of_year(ds): - """ - Derive time of year features with a cyclic encoding - - Parameters - ---------- - ds : xr.Dataset - The dataset with variables needed to derive time of year - - Returns - ------- - ds: xr.Dataset - The dataset with time of year added - """ - - logger.info("Calculating time of year") - - # Get the number of seconds a datetime corresponds to - number_of_seconds = ( - (ds.time.dt.dayofyear - 1) * 60 * 60 * 24 - + ds.time.dt.hour * 60 * 60 - + ds.time.dt.minute * 60 - + ds.time.dt.second - ) - - # Cyclic encoding using both leap year and non-leap year - # number of seconds to be able to choose later where to - # include which values using xr.where() - time_of_year_cos_non_leap, time_of_year_sin_non_leap = cyclic_encoding( - number_of_seconds, 31622400 - ) - time_of_year_cos_leap, time_of_year_sin_leap = cyclic_encoding( - number_of_seconds, 31536000 - ) - - time_of_year_cos = xr.where( - ds.time.dt.is_leap_year, - time_of_year_cos_leap, - time_of_year_cos_non_leap, - ) - time_of_year_sin = xr.where( - ds.time.dt.is_leap_year, - time_of_year_sin_leap, - time_of_year_sin_non_leap, - ) - - # Assign to the dataset - ds = ds.assign(time_of_year_sin=time_of_year_sin) - ds = ds.assign(time_of_year_cos=time_of_year_cos) - - return ds - - 
def cyclic_encoding(da, da_max): """ Cyclic encoding of data From 78540133a7a825ee71a8f9a8a86885e37398993e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 13:30:29 +0000 Subject: [PATCH 07/68] Provide the full namespace of the function --- example.danra.yaml | 2 +- mllam_data_prep/ops/forcings.py | 53 +++++++++++++++++++++++++++++++-- 2 files changed, 52 insertions(+), 3 deletions(-) diff --git a/example.danra.yaml b/example.danra.yaml index 4f1c29a..65af8bf 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,7 +82,7 @@ inputs: - time - lat - lon - method: derive_toa_radiation + method: mllam_data_prep.ops.forcings.derive_toa_radiation - hour_of_day: dependencies: - time diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/ops/forcings.py index 5d5aa69..352d04f 100644 --- a/mllam_data_prep/ops/forcings.py +++ b/mllam_data_prep/ops/forcings.py @@ -1,3 +1,6 @@ +import importlib +import sys + import dask.array as da import numpy as np import xarray as xr @@ -29,8 +32,8 @@ def derive_forcings(ds, variables): else: logger.info("Deriving additional forcings") for _, derived_var in variables_to_derive.items(): - # Get the function defined in the config file - func = globals()[derived_var.method] + # Get the function + func = get_derived_variable_function(derived_var.method) # Currently, we're passing the whole dataset ds = func(ds) @@ -50,6 +53,52 @@ def derive_forcings(ds, variables): return ds +def get_derived_variable_function(function_namespace): + """ + Function for returning the function to be used to derive + the specified variable. + + 1. Check if the function to use is in globals() + 2. If it is in globals then call it + 3. If it isn't in globals() then import the necessary module + before calling it + """ + # Get the name of the calling module + calling_module = globals()["__name__"] + + if "." 
in function_namespace: + # If the function name is a full namespace, get module and function names + module_name, function_name = function_namespace.rsplit(".", 1) + + # Check if the module_name is pointing to here (the calling module), + # and if it does then use globals() to get the function otherwise + # import the correct module and get the correct function + if module_name == calling_module: + function = globals().get(function_name) + else: + # Check if the module is already imported + if module_name in sys.modules: + module = module_name + else: + module = importlib.import_module(module_name) + + # Get the function from the module + function = getattr(module, function_name) + else: + # If function name only get it from the calling module (here) + function = globals().get(function_namespace) + if not function: + raise TypeError( + f"Function '{function_namespace}' was not found in '{calling_module}'." + f" Check that you have specified the correct function name" + " and/or that you have defined the full function namespace if you" + " want to use a function defined outside of of the current module" + f" '{calling_module}'." 
+ ) + + return function + + def derive_toa_radiation(ds): """ Derive approximate TOA radiation (instantaneous values [W*m**-2]) From 7fa90bf04963b35461a8dfcd130a6d86f37cce28 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 13:44:14 +0000 Subject: [PATCH 08/68] Rename the module with derived variables --- example.danra.yaml | 2 +- mllam_data_prep/create_dataset.py | 2 +- mllam_data_prep/{ops/forcings.py => derived_variables.py} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename mllam_data_prep/{ops/forcings.py => derived_variables.py} (100%) diff --git a/example.danra.yaml b/example.danra.yaml index 65af8bf..50a67a7 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,7 +82,7 @@ inputs: - time - lat - lon - method: mllam_data_prep.ops.forcings.derive_toa_radiation + method: mllam_data_prep.derived_variables.derive_toa_radiation - hour_of_day: dependencies: - time diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index a6813c3..df814d1 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,7 +10,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException -from .ops.forcings import derive_forcings +from .derived_variables import derive_forcings from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs diff --git a/mllam_data_prep/ops/forcings.py b/mllam_data_prep/derived_variables.py similarity index 100% rename from mllam_data_prep/ops/forcings.py rename to mllam_data_prep/derived_variables.py From 48c9e3e73fc0d3066473c8823d1bc6bb42a754af Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 12 Nov 2024 13:45:50 +0000 Subject: [PATCH 09/68] Rename the function used for deriving variables --- mllam_data_prep/create_dataset.py | 6 +++--- mllam_data_prep/derived_variables.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index df814d1..df7939a 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,7 +10,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException -from .derived_variables import derive_forcings +from .derived_variables import derive_variables from .ops.loading import load_and_subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs @@ -128,8 +128,8 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) - # Derive forcing variables (if applicable) - ds = derive_forcings(ds, variables) + # Derive variables (if applicable) + ds = derive_variables(ds, variables) dim_mapping = input_config.dim_mapping diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 352d04f..2f0a861 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -7,9 +7,9 @@ from loguru import logger -def derive_forcings(ds, variables): +def derive_variables(ds, variables): """ - Derive the specified forcings + Derive the specified variables Parameters --------- @@ -30,7 +30,7 @@ def derive_forcings(ds, variables): if variables_to_derive == {}: pass else: - logger.info("Deriving additional forcings") + logger.info("Deriving additional variables") for _, derived_var in variables_to_derive.items(): # Get the function func = get_derived_variable_function(derived_var.method) @@ -38,7 +38,7 @@ def derive_forcings(ds, variables): ds = func(ds) # Drop all the unneeded variables that have only been used to derive the - # forcing variables. HOWEVER, it's necessary to keep variables that are + # additional variables. HOWEVER, it's necessary to keep variables that are # also coordinates! 
derived_variable_dependencies = [] for _, derived_var in variables_to_derive.items(): From 8de9404911c8c21c59774251b4a93495d3799ad3 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 15 Nov 2024 07:55:34 +0000 Subject: [PATCH 10/68] Redefine the config file for derived variables and how they are calculated --- example.danra.yaml | 22 +++---- mllam_data_prep/config.py | 20 +++---- mllam_data_prep/create_dataset.py | 30 ++++++---- mllam_data_prep/derived_variables.py | 90 +++++++++++++++------------- 4 files changed, 82 insertions(+), 80 deletions(-) diff --git a/example.danra.yaml b/example.danra.yaml index 50a67a7..4152896 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -76,21 +76,13 @@ inputs: danra_additional_forcings: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr dims: [time, x, y] - variables: - - toa_radiation: - dependencies: - - time - - lat - - lon - method: mllam_data_prep.derived_variables.derive_toa_radiation - - hour_of_day: - dependencies: - - time - method: derive_hour_of_day - - day_of_year: - dependencies: - - time - method: derive_day_of_year + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.derived_variables.derive_toa_radiation dim_mapping: time: method: rename diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 354aaf9..be72de9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -67,17 +67,17 @@ class ValueSelection: @dataclass class DerivedVariable: """ - Defines a derived variables, where the dependencies (variables used - in the calculation) and the method (function for calculations) are - specified. + Defines a derived variables, where the kwargs (variables required + for the calculation) and the function (for calculating the variable) + are specified. Attributes: - dependencies: The variables to use in the calculation. - method: The methpd with which to derive the variable. 
+ kwargs: Variables required for calculating the derived variable. + function: Function used to calculate the derived variable. """ - dependencies: List[str] - method: str = None + kwargs: Dict[str, str] + function: str @dataclass @@ -183,12 +183,10 @@ class InputDataset: path: str dims: List[str] - variables: Union[ - List[Union[str, Dict[str, DerivedVariable]]], - Dict[str, Dict[str, ValueSelection]], - ] dim_mapping: Dict[str, DimMapping] target_output_variable: str + variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] = None + derived_variables: Dict[str, DerivedVariable] = None attributes: Dict[str, Any] = None diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index df7939a..2b37b91 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -111,25 +111,33 @@ def create_dataset(config: Config): for dataset_name, input_config in config.inputs.items(): path = input_config.path variables = input_config.variables + derived_variables = input_config.derived_variables target_output_var = input_config.target_output_variable expected_input_attributes = input_config.attributes or {} expected_input_var_dims = input_config.dims output_dims = output_config.variables[target_output_var] - logger.info(f"Loading dataset {dataset_name} from {path}") - try: - ds = load_and_subset_dataset(fp=path, variables=variables) - except Exception as ex: - raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex - _check_dataset_attributes( - ds=ds, - expected_attributes=expected_input_attributes, - dataset_name=dataset_name, - ) + if variables: + logger.info(f"Loading dataset {dataset_name} from {path} and subsetting") + try: + ds = load_and_subset_dataset(fp=path, variables=variables) + except Exception as ex: + raise Exception( + f"Error loading dataset {dataset_name} from {path}" + ) from ex + _check_dataset_attributes( + ds=ds, + expected_attributes=expected_input_attributes, + 
dataset_name=dataset_name, + ) # Derive variables (if applicable) - ds = derive_variables(ds, variables) + if derived_variables: + logger.info( + f"Loading dataset {dataset_name} from {path} and deriving variables" + ) + ds = derive_variables(fp=path, derived_variables=derived_variables) dim_mapping = input_config.dim_mapping diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 2f0a861..7453470 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -7,48 +7,47 @@ from loguru import logger -def derive_variables(ds, variables): +def derive_variables(fp, derived_variables): """ - Derive the specified variables + Load the dataset, and derive the specified variables Parameters --------- - ds : xr.Dataset - The loaded and subsetted dataset - variables: list/dict - List or dictionary with variables + fp : str + Filepath to the source dataset, for example the path to a zarr dataset + or a netCDF file (anything that is supported by `xarray.open_dataset` will work) + derived_variables: dict + Dictionary with the variables to derive with keys as the variable names and + values with entries for kwargs and function to be used to derive them Returns ------- ds : xr.Dataset Dataset with derived variables included """ - variables_to_derive = { - k: v for elem in variables if isinstance(elem, dict) for (k, v) in elem.items() - } - - if variables_to_derive == {}: - pass - else: - logger.info("Deriving additional variables") - for _, derived_var in variables_to_derive.items(): - # Get the function - func = get_derived_variable_function(derived_var.method) - # Currently, we're passing the whole dataset - ds = func(ds) - - # Drop all the unneeded variables that have only been used to derive the - # additional variables. HOWEVER, it's necessary to keep variables that are - # also coordinates! 
- derived_variable_dependencies = [] - for _, derived_var in variables_to_derive.items(): - derived_variable_dependencies += derived_var.dependencies - variables_to_drop = [ - var - for var in derived_variable_dependencies - if var not in list(ds._coord_names) - ] - ds = ds.drop_vars(variables_to_drop) + logger.info("Deriving variables") + + try: + ds = xr.open_zarr(fp) + except ValueError: + ds = xr.open_dataset(fp) + + ds_subset = xr.Dataset() + ds_subset.attrs.update(ds.attrs) + # Iterate derived variables + for _, derived_variable in derived_variables.items(): + required_variables = derived_variable.kwargs + function_name = derived_variable.function + # Create the input dataset containing the required variables to derive + # the specified variable + ds_input = ds[required_variables.keys()] + kwargs = {v: ds_input[v] for v in required_variables.values()} + # Get the function to be used to derive the variable + func = get_derived_variable_function(function_name) + # Calculate the derived variable + derived_field = func(**kwargs) + # Add the derived variable(s) to the subsetted dataset + ds_subset[derived_field.name] = derived_field return ds @@ -99,37 +98,42 @@ def get_derived_variable_function(function_namespace): return function -def derive_toa_radiation(ds): +def derive_toa_radiation(lat, lon, time): """ Derive approximate TOA radiation (instantaneous values [W*m**-2]) Parameters ---------- - ds : xr.Dataset - The dataset with variables needed to derive TOA radiation + lat : xr.DataArray + Latitude values + lon : xr.DataArray + Longitude values + time : xr.DataArray + Time Returns ------- - ds: xr.Dataset - The dataset with TOA radiation added + toa_radiation: xr.DataArray + TOA radiation data-array """ logger.info("Calculating top-of-atmosphere radiation") # Need to construct a new dataset with chunks since # lat and lon are coordinates and are therefore eagerly loaded ds_dict = {} - ds_dict["lat"] = (list(ds.lat.dims), da.from_array(ds.lat.values, 
chunks=(-1, -1))) - ds_dict["lon"] = (list(ds.lon.dims), da.from_array(ds.lon.values, chunks=(-1, -1))) - ds_dict["t"] = (list(ds.time.dims), da.from_array(ds.time.values, chunks=(10))) + ds_dict["lat"] = (list(lat.dims), da.from_array(lat.values, chunks=(-1, -1))) + ds_dict["lon"] = (list(lon.dims), da.from_array(lon.values, chunks=(-1, -1))) + ds_dict["t"] = (list(time.dims), da.from_array(time.values, chunks=(10))) ds_chunks = xr.Dataset(ds_dict) # Calculate TOA radiation toa_radiation = calc_toa_radiation(ds_chunks) - # Assign to the dataset - ds = ds.assign(toa_radiation=toa_radiation) + if isinstance(toa_radiation, xr.DataArray): + # Add attributes + toa_radiation.name = "toa_radiation" - return ds + return toa_radiation def calc_toa_radiation(ds): From ffc030cdf0ec3abeda73a599680de4c87b36a077 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 15 Nov 2024 08:10:10 +0000 Subject: [PATCH 11/68] Remove derived variables from 'load_and_subset_dataset' --- mllam_data_prep/ops/loading.py | 20 ++------------------ 1 file changed, 2 insertions(+), 18 deletions(-) diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index 43b1372..e97360a 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -45,27 +45,11 @@ def load_and_subset_dataset(fp, variables): ) ds_subset[var] = da elif isinstance(variables, list): - # Check if the variables in a section are all derived variables or not - if all(isinstance(var, dict) for var in variables): - variables_to_extract = set() - for var in variables: - for _, var_dict in var.items(): - variables_to_extract.update(var_dict.dependencies) - elif all(isinstance(var, str) for var in variables): - variables_to_extract = variables - else: - raise TypeError( - "Expected either a list of strings or a list of dicts " - "but got a list of mixed types. If you are trying to derive " - "variables they should go in its own input section." 
- ) - - # Subset the dataset try: - ds_subset = ds[variables_to_extract] + ds_subset = ds[variables] except KeyError as ex: raise KeyError( - f"Could not find the all variables `{variables_to_extract}` in the dataset. " + f"Could not find the all variables `{variables}` in the dataset. " f"The available variables are {list(ds.data_vars)}" ) from ex else: From 692cdd33d467a0a364daa03b933a18e2cd9f9540 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 15 Nov 2024 08:12:41 +0000 Subject: [PATCH 12/68] Add try/except for derived variables when loading the dataset --- mllam_data_prep/create_dataset.py | 17 +++++++++++++---- mllam_data_prep/derived_variables.py | 7 ++++--- 2 files changed, 17 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 2b37b91..c3e3faf 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -110,8 +110,8 @@ def create_dataset(config: Config): for dataset_name, input_config in config.inputs.items(): path = input_config.path - variables = input_config.variables - derived_variables = input_config.derived_variables + variables = input_config.variables or None + derived_variables = input_config.derived_variables or None target_output_var = input_config.target_output_variable expected_input_attributes = input_config.attributes or {} expected_input_var_dims = input_config.dims @@ -132,12 +132,21 @@ def create_dataset(config: Config): dataset_name=dataset_name, ) - # Derive variables (if applicable) if derived_variables: logger.info( f"Loading dataset {dataset_name} from {path} and deriving variables" ) - ds = derive_variables(fp=path, derived_variables=derived_variables) + try: + ds = derive_variables(fp=path, derived_variables=derived_variables) + except Exception as ex: + raise Exception( + f"Error loading dataset {dataset_name} from {path}" + ) from ex + _check_dataset_attributes( + ds=ds, + expected_attributes=expected_input_attributes, + 
dataset_name=dataset_name, + ) dim_mapping = input_config.dim_mapping diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 7453470..27075ce 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -16,9 +16,10 @@ def derive_variables(fp, derived_variables): fp : str Filepath to the source dataset, for example the path to a zarr dataset or a netCDF file (anything that is supported by `xarray.open_dataset` will work) - derived_variables: dict - Dictionary with the variables to derive with keys as the variable names and - values with entries for kwargs and function to be used to derive them + derived_variables : dict + Dictionary with the variables to derive + with keys as the variable names and values with entries for + kwargs and function to use in the calculation Returns ------- From c0cd87541d475635034a8a0ab45018949df017dc Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 08:54:20 +0000 Subject: [PATCH 13/68] Chunk the input data with the defined output chunks --- mllam_data_prep/create_dataset.py | 12 +++- mllam_data_prep/derived_variables.py | 98 ++++++++++++---------------- mllam_data_prep/ops/loading.py | 8 ++- 3 files changed, 58 insertions(+), 60 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index c3e3faf..1a2f389 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -105,6 +105,7 @@ def create_dataset(config: Config): """ output_config = config.output output_coord_ranges = output_config.coord_ranges + chunking_config = config.output.chunking or {} dataarrays_by_target = defaultdict(list) @@ -121,7 +122,9 @@ def create_dataset(config: Config): if variables: logger.info(f"Loading dataset {dataset_name} from {path} and subsetting") try: - ds = load_and_subset_dataset(fp=path, variables=variables) + ds = load_and_subset_dataset( + fp=path, variables=variables, 
chunking=chunking_config + ) except Exception as ex: raise Exception( f"Error loading dataset {dataset_name} from {path}" @@ -137,7 +140,11 @@ def create_dataset(config: Config): f"Loading dataset {dataset_name} from {path} and deriving variables" ) try: - ds = derive_variables(fp=path, derived_variables=derived_variables) + ds = derive_variables( + fp=path, + derived_variables=derived_variables, + chunking=chunking_config, + ) except Exception as ex: raise Exception( f"Error loading dataset {dataset_name} from {path}" @@ -196,7 +203,6 @@ def create_dataset(config: Config): # default to making a single chunk for each dimension if chunksize is not specified # in the config - chunking_config = config.output.chunking or {} logger.info(f"Chunking dataset with {chunking_config}") chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims} ds = ds.chunk(chunks) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 27075ce..be05c72 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -1,13 +1,12 @@ import importlib import sys -import dask.array as da import numpy as np import xarray as xr from loguru import logger -def derive_variables(fp, derived_variables): +def derive_variables(fp, derived_variables, chunking): """ Load the dataset, and derive the specified variables @@ -20,6 +19,9 @@ def derive_variables(fp, derived_variables): Dictionary with the variables to derive with keys as the variable names and values with entries for kwargs and function to use in the calculation + chunking: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size Returns ------- @@ -35,19 +37,31 @@ def derive_variables(fp, derived_variables): ds_subset = xr.Dataset() ds_subset.attrs.update(ds.attrs) - # Iterate derived variables for _, derived_variable in derived_variables.items(): required_variables = derived_variable.kwargs function_name = 
derived_variable.function - # Create the input dataset containing the required variables to derive - # the specified variable ds_input = ds[required_variables.keys()] + + # Any coordinates needed for the derivation, for which chunking should be performed, + # should be converted to variables since it is not possible for coordinates to be + # chunked dask arrays + chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} + required_coordinates = [ + req_var for req_var in required_variables if req_var in ds_input.coords + ] + ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") + for req_var in required_variables.keys(): + if req_var in ds_input.coords and req_var in chunks: + ds_input = ds_input.reset_coords(req_var) + + # Chunk the data variables + ds_input = ds_input.chunk(chunks) + + # Calculate the derived variable kwargs = {v: ds_input[v] for v in required_variables.values()} - # Get the function to be used to derive the variable func = get_derived_variable_function(function_name) - # Calculate the derived variable derived_field = func(**kwargs) - # Add the derived variable(s) to the subsetted dataset + ds_subset[derived_field.name] = derived_field return ds @@ -99,78 +113,50 @@ def get_derived_variable_function(function_namespace): return function -def derive_toa_radiation(lat, lon, time): +def calculate_toa_radiation(lat, lon, time): """ - Derive approximate TOA radiation (instantaneous values [W*m**-2]) + Function for calculating top-of-the-atmosphere radiation Parameters ---------- - lat : xr.DataArray + lat : xr.DataArray or float Latitude values - lon : xr.DataArray + lon : xr.DataArray or float Longitude values - time : xr.DataArray + time : xr.DataArray or datetime object Time Returns ------- - toa_radiation: xr.DataArray - TOA radiation data-array + toa_radiation: xr.DataArray or float + TOA radiation data """ logger.info("Calculating top-of-atmosphere radiation") - # Need to construct a new dataset with chunks 
since - # lat and lon are coordinates and are therefore eagerly loaded - ds_dict = {} - ds_dict["lat"] = (list(lat.dims), da.from_array(lat.values, chunks=(-1, -1))) - ds_dict["lon"] = (list(lon.dims), da.from_array(lon.values, chunks=(-1, -1))) - ds_dict["t"] = (list(time.dims), da.from_array(time.values, chunks=(10))) - ds_chunks = xr.Dataset(ds_dict) - - # Calculate TOA radiation - toa_radiation = calc_toa_radiation(ds_chunks) - - if isinstance(toa_radiation, xr.DataArray): - # Add attributes - toa_radiation.name = "toa_radiation" - - return toa_radiation - - -def calc_toa_radiation(ds): - """ - Function for calculation top-of-the-atmosphere radiation - - Parameters - ---------- - ds : xr.Dataset - The dataset with variables needed to derive TOA radiation - - Returns - ------- - toa_radiation: xr.DataArray - TOA radiation data-array - """ # Solar constant E0 = 1366 # W*m**-2 - day = ds.t.dt.dayofyear - hr_utc = ds.t.dt.hour + day = time.dt.dayofyear + hr_utc = time.dt.hour # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) - hr_lst = hr_utc + ds.lon / 15 + hr_lst = hr_utc + lon / 15 hr_angle = 15 * (hr_lst - 12) # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. 
- cos_sza = np.sin(ds.lat * np.pi / 180) * np.sin(dec) + np.cos( - ds.lat * np.pi / 180 + cos_sza = np.sin(lat * np.pi / 180) * np.sin(dec) + np.cos( + lat * np.pi / 180 ) * np.cos(dec) * np.cos(hr_angle * np.pi / 180) # Where TOA radiation is negative, set to 0 toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) + if isinstance(toa_radiation, xr.DataArray): + # Add attributes + toa_radiation.name = "toa_radiation" + return toa_radiation @@ -232,7 +218,7 @@ def derive_day_of_year(ds): return ds -def cyclic_encoding(da, da_max): +def cyclic_encoding(data_array, da_max): """ Cyclic encoding of data @@ -251,7 +237,7 @@ def cyclic_encoding(da, da_max): Sine part of cyclically encoded input data-array """ - da_sin = np.sin((da / da_max) * 2 * np.pi) - da_cos = np.cos((da / da_max) * 2 * np.pi) + data_array_sin = np.sin((data_array / da_max) * 2 * np.pi) + data_array_cos = np.cos((data_array / da_max) * 2 * np.pi) - return da_cos, da_sin + return data_array_cos, data_array_sin diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index e97360a..fc5d5bc 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -1,7 +1,7 @@ import xarray as xr -def load_and_subset_dataset(fp, variables): +def load_and_subset_dataset(fp, variables, chunking): """ Load the dataset, subset the variables along the specified coordinates and check coordinate units @@ -15,6 +15,9 @@ def load_and_subset_dataset(fp, variables): Dictionary with the variables to subset with keys as the variable names and values with entries for each coordinate and coordinate values to extract + chunking: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size """ try: @@ -55,4 +58,7 @@ def load_and_subset_dataset(fp, variables): else: raise ValueError("The `variables` argument should be a list or a dictionary") + chunks = {d: chunking.get(d, int(ds_subset[d].count())) for d in ds_subset.dims} + ds_subset = 
ds_subset.chunk(chunks) + return ds_subset From 55224f34f91654b44680c277eb1f06e5df5a1a92 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 14:17:38 +0000 Subject: [PATCH 14/68] Update toa_radiation function name --- example.danra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.danra.yaml b/example.danra.yaml index 4152896..b351a52 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,7 +82,7 @@ inputs: time: time lat: lat lon: lon - function: mllam_data_prep.derived_variables.derive_toa_radiation + function: mllam_data_prep.derived_variables.calculate_toa_radiation dim_mapping: time: method: rename From 678ea523c39794f93831a6fe362fc6fba4b1c23d Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 14:20:02 +0000 Subject: [PATCH 15/68] Correct kwargs usage, add back dropped coordinates and return correct dataset --- mllam_data_prep/derived_variables.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index be05c72..114638e 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -47,24 +47,28 @@ def derive_variables(fp, derived_variables, chunking): # chunked dask arrays chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} required_coordinates = [ - req_var for req_var in required_variables if req_var in ds_input.coords + req_var + for req_var in required_variables.keys() + if req_var in ds_input.coords ] ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") - for req_var in required_variables.keys(): - if req_var in ds_input.coords and req_var in chunks: - ds_input = ds_input.reset_coords(req_var) + for req_coord in required_coordinates: + if req_coord in chunks: + ds_input = ds_input.reset_coords(req_coord) # Chunk the data variables ds_input = ds_input.chunk(chunks) # Calculate the derived variable 
- kwargs = {v: ds_input[v] for v in required_variables.values()} + kwargs = {v: ds_input[k] for k, v in required_variables.items()} func = get_derived_variable_function(function_name) derived_field = func(**kwargs) - + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) ds_subset[derived_field.name] = derived_field - return ds + return ds_subset def get_derived_variable_function(function_namespace): @@ -113,6 +117,15 @@ def get_derived_variable_function(function_namespace): return function +def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): + """Return coordinates that have been reset.""" + for req_coord in required_coordinates: + if req_coord in chunks: + derived_field.coords[req_coord] = ds_input[req_coord] + + return derived_field + + def calculate_toa_radiation(lat, lon, time): """ Function for calculating top-of-the-atmosphere radiation From 9d2db079309b258dc41c66b8b81ef83dcda04f5e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 5 Dec 2024 14:22:54 +0000 Subject: [PATCH 16/68] Prepare for hour_of_day and day_of_year --- mllam_data_prep/derived_variables.py | 74 ++++++++++++++++++---------- 1 file changed, 48 insertions(+), 26 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 114638e..760e0b3 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -63,10 +63,20 @@ def derive_variables(fp, derived_variables, chunking): kwargs = {v: ds_input[k] for k, v in required_variables.items()} func = get_derived_variable_function(function_name) derived_field = func(**kwargs) - derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks - ) - ds_subset[derived_field.name] = derived_field + + # Some of the derived variables include two components, since + # they are cyclically encoded (cos and sin parts) + if isinstance(derived_field, 
xr.DataArray): + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) + ds_subset[derived_field.name] = derived_field + elif isinstance(derived_field, tuple): + for field in derived_field: + field = _return_dropped_coordinates( + field, ds_input, required_coordinates, chunks + ) + ds_subset[field.name] = field return ds_subset @@ -173,62 +183,74 @@ def calculate_toa_radiation(lat, lon, time): return toa_radiation -def derive_hour_of_day(ds): +def calculate_hour_of_day(time): """ - Derive hour of day features with a cyclic encoding + Function for calculating hour of day features with a cyclic encoding Parameters ---------- - ds : xr.Dataset - The dataset with variables needed to derive hour of day + time : xr.DataArray or datetime object + Time Returns ------- - ds: xr.Dataset - The dataset with hour of day added + hour_of_day_cos: xr.DataArray or float + cosine of the hour of day + hour_of_day_sin: xr.DataArray or float + sine of the hour of day """ logger.info("Calculating hour of day") # Get the hour of the day - hour_of_day = ds.time.dt.hour + hour_of_day = time.dt.hour # Cyclic encoding of hour of day hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) - # Assign to the dataset - ds = ds.assign(hour_of_day_sin=hour_of_day_sin) - ds = ds.assign(hour_of_day_cos=hour_of_day_cos) + if isinstance(hour_of_day_cos, xr.DataArray): + # Add attributes + hour_of_day_cos.name = "hour_of_day_cos" + + if isinstance(hour_of_day_sin, xr.DataArray): + # Add attributes + hour_of_day_sin.name = "hour_of_day_sin" - return ds + return hour_of_day_cos, hour_of_day_sin -def derive_day_of_year(ds): +def calculate_day_of_year(time): """ - Derive day of year features with a cyclic encoding + Function for calculating day of year features with a cyclic encoding Parameters ---------- - ds : xr.Dataset - The dataset with variables needed to derive day of year + time : xr.DataArray or datetime object + Time Returns ------- - 
ds: xr.Dataset - The dataset with day of year added + day_of_year_cos: xr.DataArray or float + cosine of the day of year + day_of_year_sin: xr.DataArray or float + sine of the day of year """ logger.info("Calculating day of year") # Get the day of year - day_of_year = ds.time.dt.dayofyear + day_of_year = time.dt.dayofyear # Cyclic encoding of day of year - use 366 to include leap years! day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) - # Assign to the dataset - ds = ds.assign(day_of_year_sin=day_of_year_sin) - ds = ds.assign(day_of_year_cos=day_of_year_cos) + if isinstance(day_of_year_cos, xr.DataArray): + # Add attributes + day_of_year_cos.name = "day_of_year_cos" + + if isinstance(day_of_year_sin, xr.DataArray): + # Add attributes + day_of_year_sin.name = "day_of_year_sin" - return ds + return day_of_year_cos, day_of_year_sin def cyclic_encoding(data_array, da_max): From 26455bc2aff82e40a2c04cd526afd9e2577457d6 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 6 Dec 2024 11:24:19 +0000 Subject: [PATCH 17/68] Add optional 'attributes' to the config of 'derived_variables' and check the attributes of the derived variable data-array --- mllam_data_prep/config.py | 10 +- mllam_data_prep/create_dataset.py | 1 + mllam_data_prep/derived_variables.py | 172 +++++++++++++++++++++++---- 3 files changed, 162 insertions(+), 21 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index be72de9..c6192d1 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -78,6 +78,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str + attributes: Dict[str, Any] = None @dataclass @@ -148,7 +149,8 @@ class InputDataset: 1) the path to the dataset, 2) the expected dimensions of the dataset, 3) the variables to select from the dataset (and optionally subsection - along the coordinates for each variable) and finally + along the coordinates for each variable) and/or the variables to derive + from the dataset, and 
finally 4) the method by which the dimensions and variables of the dataset are mapped to one of the output variables (this includes stacking of all the selected variables into a new single variable along a new coordinate, @@ -179,6 +181,12 @@ class InputDataset: (e.g. two datasets that coincide in space and time will only differ in the feature dimension, so the two will be combined by concatenating along the feature dimension). If a single shared coordinate cannot be found then an exception will be raised. + derived_variables: Dict[str, DerivedVariable] + Dictionary of variables to derive from the dataset, where the keys are the variable names and + the values are dictionaries defining the necessary function and kwargs. E.g. + `{"toa_radiation": {"kwargs": {"time": "time", "lat": "lat", "lon": "lon"}, "function": "calculate_toa_radiation"}}` + would derive the "toa_radiation" variable using the `calculate_toa_radiation` function, which + takes `time`, `lat` and `lon` as arguments. """ path: str diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 1a2f389..4ce5e14 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -148,6 +148,7 @@ def create_dataset(config: Config): except Exception as ex: raise Exception( f"Error loading dataset {dataset_name} from {path}" + f" or deriving variables '{', '.join(list(derived_variables.keys()))}'." 
) from ex _check_dataset_attributes( ds=ds, diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 760e0b3..cda1bdf 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -40,6 +40,7 @@ def derive_variables(fp, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_variables = derived_variable.kwargs function_name = derived_variable.function + derived_variable_attributes = derived_variable.attributes or {} ds_input = ds[required_variables.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, @@ -61,35 +62,50 @@ def derive_variables(fp, derived_variables, chunking): # Calculate the derived variable kwargs = {v: ds_input[k] for k, v in required_variables.items()} - func = get_derived_variable_function(function_name) + func = _get_derived_variable_function(function_name) derived_field = func(**kwargs) - # Some of the derived variables include two components, since - # they are cyclically encoded (cos and sin parts) + # Check the derived field(s) + derived_field = _check_field( + derived_field, + derived_variable_attributes, + ds_input, + required_coordinates, + chunks, + ) + + # Add the derived field(s) to the subset if isinstance(derived_field, xr.DataArray): - derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks - ) ds_subset[derived_field.name] = derived_field - elif isinstance(derived_field, tuple): + elif isinstance(derived_field, tuple) and all( + isinstance(field, xr.DataArray) for field in derived_field + ): for field in derived_field: - field = _return_dropped_coordinates( - field, ds_input, required_coordinates, chunks - ) ds_subset[field.name] = field + else: + raise TypeError( + "Expected an instance of xr.DataArray or tuple(xr.DataArray)," + f" but got {type(derived_field)}." 
+ ) return ds_subset -def get_derived_variable_function(function_namespace): +def _get_derived_variable_function(function_namespace): """ - Function for returning the function to be used to derive + Function for getting the function for deriving the specified variable. - 1. Check if the function to use is in globals() - 2. If it is in globals then call it - 3. If it isn't in globals() then import the necessary module - before calling it + Parameters + ---------- + function_namespace: str + The full function namespace or just the function name + if it is a function included in this module. + + Returns + ------- + function: object + Function for deriving the specified variable """ # Get the name of the calling module calling_module = globals()["__name__"] @@ -127,13 +143,111 @@ def get_derived_variable_function(function_namespace): return function -def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): - """Return coordinates that have been reset.""" +def _check_field( + derived_field, derived_field_attributes, ds_input, required_coordinates, chunks +): + """ + Check the derived field. + + Parameters + ---------- + derived_field: xr.DataArray or tuple + The derived variable + derived_field_attributes: dict + Dictionary with attributes for the derived variables. + Defined in the config file. + ds_input: xr.Dataset + xarray dataset with variables needed to derive the specified variable + required_coordinates: list + List of coordinates required for deriving the specified variable + chunks: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size, only inbcluding the dimensions that are included + in the output as well. 
+ + Returns + ------- + derived_field: xr.DataArray or tuple + The derived field + """ + if isinstance(derived_field, xr.DataArray): + derived_field = _check_attributes(derived_field, derived_field_attributes) + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) + elif isinstance(derived_field, tuple) and all( + isinstance(field, xr.DataArray) for field in derived_field + ): + for field in derived_field: + field = _check_attributes(field, derived_field_attributes) + field = _return_dropped_coordinates( + field, ds_input, required_coordinates, chunks + ) + else: + raise TypeError( + "Expected an instance of xr.DataArray or tuple(xr.DataArray)," + f" but got {type(derived_field)}." + ) + + return derived_field + + +def _check_attributes(field, field_attributes): + """ + Check the attributes of the derived variable. + + Parameters + ---------- + field: xr.DataArray or tuple + The derived field + field_attributes: dict + Dictionary with attributes for the derived variables. + Defined in the config file. + + Returns + ------- + field: xr.DataArray or tuple + The derived field + """ + for attribute in ["units", "long_name"]: + if attribute not in field.attrs or field.attrs[attribute] is None: + if attribute in field_attributes.keys(): + field.attrs[attribute] = field_attributes[attribute] + else: + # The expected attributes are empty and the attributes have not been + # set during the calculation of the derived variable + raise ValueError( + f"The attribute '{attribute}' has not been set for the derived" + f" variable '{field.name}' (most likely because you are using a" + " function external to `mlllam-data-prep` to derive the field)." + " This attribute has not been defined in the 'attributes' section" + " of the config file either. Make sure that you add it to the" + f" 'attributes' section of the derived variable '{field.name}'." 
+ ) + else: + if attribute in field_attributes.keys(): + logger.warning( + f"The attribute '{attribute}' of the derived field" + f" {field.name} is being overwritten from" + f" '{field.attrs[attribute]}' to" + f" '{field_attributes[attribute]}' according" + " to specification in the config file." + ) + field.attrs[attribute] = field_attributes[attribute] + else: + # Attributes are set and nothing has been defined in the config file + pass + + return field + + +def _return_dropped_coordinates(field, ds_input, required_coordinates, chunks): + """Return the coordinates that have been reset.""" for req_coord in required_coordinates: if req_coord in chunks: - derived_field.coords[req_coord] = ds_input[req_coord] + field.coords[req_coord] = ds_input[req_coord] - return derived_field + return field def calculate_toa_radiation(lat, lon, time): @@ -179,6 +293,8 @@ def calculate_toa_radiation(lat, lon, time): if isinstance(toa_radiation, xr.DataArray): # Add attributes toa_radiation.name = "toa_radiation" + toa_radiation.attrs["long_name"] = "top-of-the-atmosphere radiation" + toa_radiation.attrs["units"] = "W*m**-2" return toa_radiation @@ -210,10 +326,18 @@ def calculate_hour_of_day(time): if isinstance(hour_of_day_cos, xr.DataArray): # Add attributes hour_of_day_cos.name = "hour_of_day_cos" + hour_of_day_cos.attrs[ + "long_name" + ] = "Cosine component of cyclically encoded hour of day" + hour_of_day_cos.attrs["units"] = "1" if isinstance(hour_of_day_sin, xr.DataArray): # Add attributes hour_of_day_sin.name = "hour_of_day_sin" + hour_of_day_sin.attrs[ + "long_name" + ] = "Sine component of cyclically encoded hour of day" + hour_of_day_sin.attrs["units"] = "1" return hour_of_day_cos, hour_of_day_sin @@ -245,10 +369,18 @@ def calculate_day_of_year(time): if isinstance(day_of_year_cos, xr.DataArray): # Add attributes day_of_year_cos.name = "day_of_year_cos" + day_of_year_cos.attrs[ + "long_name" + ] = "Cosine component of cyclically encoded day of year" + 
day_of_year_cos.attrs["units"] = "1" if isinstance(day_of_year_sin, xr.DataArray): # Add attributes day_of_year_sin.name = "day_of_year_sin" + day_of_year_sin.attrs[ + "long_name" + ] = "Sine component of cyclically encoded day of year" + day_of_year_sin.attrs["units"] = "1" return day_of_year_cos, day_of_year_sin From fbb606504b48df6fa1b7925ca52f67f721760812 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 07:42:16 +0000 Subject: [PATCH 18/68] Add dummy function for getting lat,lon (preparation for #33) --- mllam_data_prep/derived_variables.py | 35 +++++++++++++++++++++------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index cda1bdf..b6b67db 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -38,19 +38,25 @@ def derive_variables(fp, derived_variables, chunking): ds_subset = xr.Dataset() ds_subset.attrs.update(ds.attrs) for _, derived_variable in derived_variables.items(): - required_variables = derived_variable.kwargs + required_kwargs = derived_variable.kwargs function_name = derived_variable.function derived_variable_attributes = derived_variable.attributes or {} - ds_input = ds[required_variables.keys()] + + # Separate the lat,lon from the required variables as these will be derived separately + latlon_coords_to_include = {} + for k, v in list(required_kwargs.items()): + if k in ["lat", "lon"]: + latlon_coords_to_include[k] = required_kwargs.pop(k) + + # Subset the dataset + ds_input = ds[required_kwargs.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, - # should be converted to variables since it is not possible for coordinates to be - # chunked dask arrays + # should be converted to variables since it is not possible for *indexed* coordinates + # to be chunked dask arrays chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} 
required_coordinates = [ - req_var - for req_var in required_variables.keys() - if req_var in ds_input.coords + req_var for req_var in required_kwargs.keys() if req_var in ds_input.coords ] ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") for req_coord in required_coordinates: @@ -60,9 +66,15 @@ def derive_variables(fp, derived_variables, chunking): # Chunk the data variables ds_input = ds_input.chunk(chunks) - # Calculate the derived variable - kwargs = {v: ds_input[k] for k, v in required_variables.items()} + # Add function arguments to kwargs + kwargs = {} + if len(latlon_coords_to_include): + latlon = get_latlon_coords_for_input(ds) + for k, v in latlon_coords_to_include.items(): + kwargs[v] = latlon[k] + kwargs.update({v: ds_input[k] for k, v in required_kwargs.items()}) func = _get_derived_variable_function(function_name) + # Calculate the derived variable derived_field = func(**kwargs) # Check the derived field(s) @@ -408,3 +420,8 @@ def cyclic_encoding(data_array, da_max): data_array_cos = np.cos((data_array / da_max) * 2 * np.pi) return data_array_cos, data_array_sin + + +def get_latlon_coords_for_input(ds_input): + """Dummy function for getting lat and lon.""" + return ds_input[["lat", "lon"]].chunk(-1, -1) From 3a12f4839fcfaa72f11cc5e05736db7fcbb6cd0f Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 10:37:13 +0000 Subject: [PATCH 19/68] Add function for chunking data and checking the chunk size --- mllam_data_prep/derived_variables.py | 53 ++++++++++++++++++++++++++-- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index b6b67db..a985520 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -63,8 +63,8 @@ def derive_variables(fp, derived_variables, chunking): if req_coord in chunks: ds_input = ds_input.reset_coords(req_coord) - # Chunk the data variables - ds_input = 
ds_input.chunk(chunks) + # Chunk the dataset + ds_input = _chunk_dataset(ds_input, chunks) # Add function arguments to kwargs kwargs = {} @@ -103,6 +103,55 @@ def derive_variables(fp, derived_variables, chunking): return ds_subset +def _chunk_dataset(ds, chunks): + """ + Chunk dataset and check the chunk size. + + Parameters + ---------- + ds: xr.Dataset + Dataset to be chunked + chunks: dict + Dictionary with keys as dimensions to be chunked and + chunk sizes as the values + + Returns + ------- + ds: xr.Dataset + Dataset with chunking applied + """ + # Define the memory limit check + memory_limit_check = 1 * 1024**3 # 1 GB + + # Check the chunk size + for var_name, var_data in ds.data_vars.items(): + total_size = 1 + + for dim, chunk_size in chunks.items(): + dim_size = ds.sizes.get(dim, None) + if dim_size is None: + raise KeyError(f"Dimension '{dim}' not found in the dataset.") + total_size *= chunk_size + + dtype = var_data.dtype + bytes_per_element = np.dtype(dtype).itemsize + + memory_usage = total_size * bytes_per_element + + if memory_usage > memory_limit_check: + logger.warning( + f"The chunk size for '{var_name}' exceeds '{memory_limit_check}' GB." 
+ ) + + # Try chunking + try: + ds = ds.chunk(chunks) + except Exception as ex: + raise Exception(f"Error chunking dataset: {ex}") + + return ds + + def _get_derived_variable_function(function_namespace): """ Function for getting the function for deriving From 3ace21989c4642d29b3a24b4b0abcc91bb6522f7 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 11:56:23 +0000 Subject: [PATCH 20/68] Add back coordinates on the subset instead of for each derived variable individually --- mllam_data_prep/derived_variables.py | 37 ++++++++-------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index a985520..d0fe2fa 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -78,13 +78,7 @@ def derive_variables(fp, derived_variables, chunking): derived_field = func(**kwargs) # Check the derived field(s) - derived_field = _check_field( - derived_field, - derived_variable_attributes, - ds_input, - required_coordinates, - chunks, - ) + derived_field = _check_field(derived_field, derived_variable_attributes) # Add the derived field(s) to the subset if isinstance(derived_field, xr.DataArray): @@ -100,6 +94,11 @@ def derive_variables(fp, derived_variables, chunking): f" but got {type(derived_field)}." ) + # Add back dropped coordinates + ds_subset = _return_dropped_coordinates( + ds_subset, ds_input, required_coordinates, chunks + ) + return ds_subset @@ -204,9 +203,7 @@ def _get_derived_variable_function(function_namespace): return function -def _check_field( - derived_field, derived_field_attributes, ds_input, required_coordinates, chunks -): +def _check_field(derived_field, derived_field_attributes): """ Check the derived field. @@ -217,14 +214,6 @@ def _check_field( derived_field_attributes: dict Dictionary with attributes for the derived variables. Defined in the config file. 
- ds_input: xr.Dataset - xarray dataset with variables needed to derive the specified variable - required_coordinates: list - List of coordinates required for deriving the specified variable - chunks: dict - Dictionary with keys as the dimensions to chunk along and values - with the chunk size, only inbcluding the dimensions that are included - in the output as well. Returns ------- @@ -233,17 +222,11 @@ def _check_field( """ if isinstance(derived_field, xr.DataArray): derived_field = _check_attributes(derived_field, derived_field_attributes) - derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks - ) elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: field = _check_attributes(field, derived_field_attributes) - field = _return_dropped_coordinates( - field, ds_input, required_coordinates, chunks - ) else: raise TypeError( "Expected an instance of xr.DataArray or tuple(xr.DataArray)," @@ -302,13 +285,13 @@ def _check_attributes(field, field_attributes): return field -def _return_dropped_coordinates(field, ds_input, required_coordinates, chunks): +def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunks): """Return the coordinates that have been reset.""" for req_coord in required_coordinates: if req_coord in chunks: - field.coords[req_coord] = ds_input[req_coord] + ds_subset.coords[req_coord] = ds_input[req_coord] - return field + return ds_subset def calculate_toa_radiation(lat, lon, time): From a6b61b0ac6c00768a2c7c9f88bf11175fd3d3f3a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 11:57:05 +0000 Subject: [PATCH 21/68] Add 'hour_of_day' to example config --- example.danra.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/example.danra.yaml b/example.danra.yaml index b351a52..378c78a 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -83,6 +83,10 @@ inputs: lat: lat lon: lon 
function: mllam_data_prep.derived_variables.calculate_toa_radiation + hour_of_day: + kwargs: + time: time + function: mllam_data_prep.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename From 9dcace68963a359af644ed1219e24099c218d29a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Mon, 9 Dec 2024 13:15:15 +0000 Subject: [PATCH 22/68] Rename derived variables dataset section in the example config --- example.danra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.danra.yaml b/example.danra.yaml index bbf3dc7..9d3f1cf 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -73,7 +73,7 @@ inputs: name_format: "{var_name}" target_output_variable: forcing - danra_additional_forcings: + danra_derived_forcings: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr dims: [time, x, y] derived_variables: From aba675764def61e9797b80933236a5e9c3d5b2b9 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 10 Dec 2024 07:35:20 +0000 Subject: [PATCH 23/68] Remove f-string from 'name_format' --- example.danra.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/example.danra.yaml b/example.danra.yaml index 9d3f1cf..f1fa443 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -96,7 +96,7 @@ inputs: dims: [x, y] forcing_feature: method: stack_variables_by_var_name - name_format: f"{var_name}" + name_format: "{var_name}" target_output_variable: forcing danra_lsm: From 143edb638a78ef897265b415fd4a176e06c2a491 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 10 Dec 2024 07:52:00 +0000 Subject: [PATCH 24/68] Update README --- README.md | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 61 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 5f5fcdf..7d9f947 100644 --- a/README.md +++ b/README.md @@ -187,6 +187,32 @@ inputs: name_format: "{var_name}" target_output_variable: forcing + danra_derived_forcings: + path: 
https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr + dims: [time, x, y] + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.derived_variables.calculate_toa_radiation + hour_of_day: + kwargs: + time: time + function: mllam_data_prep.derived_variables.calculate_hour_of_day + dim_mapping: + time: + method: rename + dim: time + grid_index: + method: stack + dims: [x, y] + forcing_feature: + method: stack_variables_by_var_name + name_format: "{var_name}" + target_output_variable: forcing + danra_lsm: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/lsm.zarr dims: [x, y] @@ -286,15 +312,40 @@ inputs: grid_index: method: stack dims: [x, y] - target_architecture_variable: state + target_output_variable: state danra_surface: path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr dims: [time, x, y] variables: - # shouldn't really be using sea-surface pressure as "forcing", but don't - # have radiation varibles in danra yet - - pres_seasurface + # use surface incoming shortwave radiation as forcing + - swavr0m + dim_mapping: + time: + method: rename + dim: time + grid_index: + method: stack + dims: [x, y] + forcing_feature: + method: stack_variables_by_var_name + name_format: "{var_name}" + target_output_variable: forcing + + danra_derived_forcings: + path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr + dims: [time, x, y] + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.derived_variables.calculate_toa_radiation + hour_of_day: + kwargs: + time: time + function: mllam_data_prep.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename @@ -305,7 +356,7 @@ inputs: forcing_feature: method: stack_variables_by_var_name name_format: "{var_name}" - target_architecture_variable: forcing + target_output_variable: forcing ... 
``` @@ -315,11 +366,15 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `path`: the path to the source dataset. This can be a local path or a URL to e.g. a zarr dataset or netCDF file, anything that can be read by `xarray.open_dataset(...)`. - `dims`: the dimensions that the source dataset is expected to have. This is used to check that the source dataset has the expected dimensions and also makes it clearer in the config file what the dimensions of the source dataset are. - `variables`: selects which variables to extract from the source dataset. This may either be a list of variable names, or a dictionary where each key is the variable name and the value defines a dictionary of coordinates to do selection on. When doing selection you may also optionally define the units of the variable to check that the units of the variable match the units of the variable in the model architecture. -- `target_architecture_variable`: the variable in the model architecture that the source dataset should be mapped to. +- `target_output_variable`: the variable in the model architecture that the source dataset should be mapped to. - `dim_mapping`: defines how the dimensions of the source dataset should be mapped to the dimensions of the model architecture. This is done by defining a method to apply to each dimension. The methods are: - `rename`: simply rename the dimension to the new name - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. +- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. 
This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with additional information. +- `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. +- `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. +- `attributes`: section where users can specify the attributes `units` and `long_name` as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. 
### Config schema versioning From 12e057571e6e9bcc78a5bd7706198ea31536f0a2 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 11 Dec 2024 07:57:11 +0000 Subject: [PATCH 25/68] Update CHANGELOG --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbb8ea1..da3b7b6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added +- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34) - add github PR template to guide development process on github [\#44](https://github.com/mllam/mllam-data-prep/pull/44), @leifdenby ## [v0.5.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.5.0) From 000ce925f51b4c5add0d3c08f53ccc522f4cfb47 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 11 Dec 2024 12:57:06 +0000 Subject: [PATCH 26/68] Make functions for deriving toa_radiation and datetime forcings actually handle both xr.DataArray and scalars --- mllam_data_prep/derived_variables.py | 35 ++++++++++++++++++++++++---- 1 file changed, 31 insertions(+), 4 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index d0fe2fa..e5d0889 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -1,3 +1,4 @@ +import datetime import importlib import sys @@ -317,8 +318,18 @@ def calculate_toa_radiation(lat, lon, time): # Solar constant E0 = 1366 # W*m**-2 - day = time.dt.dayofyear - hr_utc = time.dt.hour + # Different handling if xr.DataArray or datetime object + if isinstance(time, xr.DataArray): + day = time.dt.dayofyear + hr_utc = time.dt.hour + elif isinstance(time, datetime.datetime): + day = time.timetuple().tm_yday + hr_utc = time.hour + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) # Eq. 
1.6.1a in Solar Engineering of Thermal Processes 4th ed. dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) @@ -362,7 +373,15 @@ def calculate_hour_of_day(time): logger.info("Calculating hour of day") # Get the hour of the day - hour_of_day = time.dt.hour + if isinstance(time, xr.DataArray): + hour_of_day = time.dt.hour + elif isinstance(time, datetime.datetime): + hour_of_day = time.hour + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) # Cyclic encoding of hour of day hour_of_day_cos, hour_of_day_sin = cyclic_encoding(hour_of_day, 24) @@ -405,7 +424,15 @@ def calculate_day_of_year(time): logger.info("Calculating day of year") # Get the day of year - day_of_year = time.dt.dayofyear + if isinstance(time, xr.DataArray): + day_of_year = time.dt.dayofyear + elif isinstance(time, datetime.datetime): + day_of_year = time.timetuple().tm_yday + else: + raise TypeError( + "Expected an instance of xr.DataArray or datetime object," + f" but got {type(time)}." + ) # Cyclic encoding of day of year - use 366 to include leap years! 
day_of_year_cos, day_of_year_sin = cyclic_encoding(day_of_year, 366) From 0af6319922b075e8a44a89c6af091939fdfa89cc Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 11 Dec 2024 13:05:22 +0000 Subject: [PATCH 27/68] Update docstring and variable names in 'cyclic_encoding' --- mllam_data_prep/derived_variables.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index e5d0889..6217ade 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -456,29 +456,29 @@ def calculate_day_of_year(time): return day_of_year_cos, day_of_year_sin -def cyclic_encoding(data_array, da_max): +def cyclic_encoding(data, data_max): """ Cyclic encoding of data Parameters ---------- - da : xr.DataArray - xarray data-array that should be cyclically encoded - da_max: int/float - Maximum possible value of input data-array + data : xr.DataArray, float, or int + Data that should be cyclically encoded + data_max: int or float + Maximum possible value of input data. Should be greater than 0. 
Returns ------- - da_cos: xr.DataArray - Cosine part of cyclically encoded input data-array - da_sin: xr.DataArray - Sine part of cyclically encoded input data-array + data_cos: xr.DataArray, float, or int + Cosine part of cyclically encoded input data + data_sin: xr.DataArray, float, or int + Sine part of cyclically encoded input data """ - data_array_sin = np.sin((data_array / da_max) * 2 * np.pi) - data_array_cos = np.cos((data_array / da_max) * 2 * np.pi) + data_sin = np.sin((data / data_max) * 2 * np.pi) + data_cos = np.cos((data / data_max) * 2 * np.pi) - return data_array_cos, data_array_sin + return data_cos, data_sin def get_latlon_coords_for_input(ds_input): From 284db913e5dd7e190a764c075df1d96d2080a092 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:15:56 +0000 Subject: [PATCH 28/68] Add ranges to lat and lon in docstring --- mllam_data_prep/derived_variables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 6217ade..37ccc67 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -302,9 +302,9 @@ def calculate_toa_radiation(lat, lon, time): Parameters ---------- lat : xr.DataArray or float - Latitude values + Latitude values. Should be in the range [-90, 90] lon : xr.DataArray or float - Longitude values + Longitude values. 
Should be in the range [-180, 180] or [0, 360] time : xr.DataArray or datetime object Time From ba161d23c5948797465297576024baf7e8c335ad Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:20:22 +0000 Subject: [PATCH 29/68] Add github username to CHANGELOG entry --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index da3b7b6..c30bb81 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,7 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added -- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34) +- add ability to derive variables from input datasets [\#34](https://github.com/mllam/mllam-data-prep/pull/34), @ealerskans - add github PR template to guide development process on github [\#44](https://github.com/mllam/mllam-data-prep/pull/44), @leifdenby ## [v0.5.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.5.0) From e3d590cae070498d7240268d98c8515850b8480c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:24:19 +0000 Subject: [PATCH 30/68] Update DerivedVariable attributes to be Dict[str, str] --- mllam_data_prep/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index c6069f4..82bad84 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -66,7 +66,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str - attributes: Dict[str, Any] = None + attributes: Dict[str, str] = None @dataclass From f8cae4ffb9bff918131f73c92a64e1812702800a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 10:27:24 +0000 Subject: [PATCH 31/68] Add missing attribute to docstring --- mllam_data_prep/config.py | 1 + 1 file changed, 1 insertion(+) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 82bad84..f20c3b2 100644 --- a/mllam_data_prep/config.py 
+++ b/mllam_data_prep/config.py @@ -62,6 +62,7 @@ class DerivedVariable: Attributes: kwargs: Variables required for calculating the derived variable. function: Function used to calculate the derived variable. + attributes: Attributes (e.g. `units` and `long_name`) for the derived variable. """ kwargs: Dict[str, str] From 8470c8263fcde06b46f606d7ae5045348ae41997 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 11:54:03 +0000 Subject: [PATCH 32/68] Change var names in 'calculate_toa_radiation' --- mllam_data_prep/derived_variables.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 37ccc67..57701f6 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -316,15 +316,15 @@ def calculate_toa_radiation(lat, lon, time): logger.info("Calculating top-of-atmosphere radiation") # Solar constant - E0 = 1366 # W*m**-2 + solar_constant = 1366 # W*m**-2 # Different handling if xr.DataArray or datetime object if isinstance(time, xr.DataArray): day = time.dt.dayofyear - hr_utc = time.dt.hour + hour_utc = time.dt.hour elif isinstance(time, datetime.datetime): day = time.timetuple().tm_yday - hr_utc = time.hour + hour_utc = time.hour else: raise TypeError( "Expected an instance of xr.DataArray or datetime object," @@ -332,18 +332,21 @@ def calculate_toa_radiation(lat, lon, time): ) # Eq. 1.6.1a in Solar Engineering of Thermal Processes 4th ed. + # dec: declination - angular position of the sun at solar noon w.r.t. + # the plane of the equator dec = np.pi / 180 * 23.45 * np.sin(2 * np.pi * (284 + day) / 365) - hr_lst = hr_utc + lon / 15 - hr_angle = 15 * (hr_lst - 12) + utc_solar_time = hour_utc + lon / 15 + hour_angle = 15 * (utc_solar_time - 12) # Eq. 1.6.2 with beta=0 in Solar Engineering of Thermal Processes 4th ed. 
+ # cos_sza: Cosine of solar zenith angle cos_sza = np.sin(lat * np.pi / 180) * np.sin(dec) + np.cos( lat * np.pi / 180 - ) * np.cos(dec) * np.cos(hr_angle * np.pi / 180) + ) * np.cos(dec) * np.cos(hour_angle * np.pi / 180) # Where TOA radiation is negative, set to 0 - toa_radiation = xr.where(E0 * cos_sza < 0, 0, E0 * cos_sza) + toa_radiation = xr.where(solar_constant * cos_sza < 0, 0, solar_constant * cos_sza) if isinstance(toa_radiation, xr.DataArray): # Add attributes From 69afdd3d1d54e6c7c59b74c34ab2762c2cfd95a5 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 12:47:25 +0000 Subject: [PATCH 33/68] Remove unnecessary 'or None' --- mllam_data_prep/create_dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 1684e61..73ae043 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -127,8 +127,8 @@ def create_dataset(config: Config): for dataset_name, input_config in config.inputs.items(): path = input_config.path - variables = input_config.variables or None - derived_variables = input_config.derived_variables or None + variables = input_config.variables + derived_variables = input_config.derived_variables target_output_var = input_config.target_output_variable expected_input_attributes = input_config.attributes or {} expected_input_var_dims = input_config.dims From e17ed8b7d1f2ac654ee7bf22e5ab1f6172f3e69c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 12:51:24 +0000 Subject: [PATCH 34/68] Use var name 'dim' instead of 'd' --- mllam_data_prep/create_dataset.py | 2 +- mllam_data_prep/derived_variables.py | 4 +++- mllam_data_prep/ops/loading.py | 4 +++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 73ae043..ce95b58 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ 
-221,7 +221,7 @@ def create_dataset(config: Config): # default to making a single chunk for each dimension if chunksize is not specified # in the config logger.info(f"Chunking dataset with {chunking_config}") - chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims} + chunks = {dim: chunking_config.get(dim, int(ds[dim].count())) for dim in ds.dims} ds = ds.chunk(chunks) splitting = config.output.splitting diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 57701f6..6ec212e 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -55,7 +55,9 @@ def derive_variables(fp, derived_variables, chunking): # Any coordinates needed for the derivation, for which chunking should be performed, # should be converted to variables since it is not possible for *indexed* coordinates # to be chunked dask arrays - chunks = {d: chunking.get(d, int(ds_input[d].count())) for d in ds_input.dims} + chunks = { + dim: chunking.get(dim, int(ds_input[dim].count())) for dim in ds_input.dims + } required_coordinates = [ req_var for req_var in required_kwargs.keys() if req_var in ds_input.coords ] diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index fc5d5bc..5275c57 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -58,7 +58,9 @@ def load_and_subset_dataset(fp, variables, chunking): else: raise ValueError("The `variables` argument should be a list or a dictionary") - chunks = {d: chunking.get(d, int(ds_subset[d].count())) for d in ds_subset.dims} + chunks = { + dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims + } ds_subset = ds_subset.chunk(chunks) return ds_subset From 23b119f7bd3972223de7410d028ab8bc42ccecac Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 12:56:36 +0000 Subject: [PATCH 35/68] Use var names 'key, val' instead of 'k, v' --- mllam_data_prep/create_dataset.py | 7 +++++-- 
mllam_data_prep/derived_variables.py | 12 ++++++------ 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index ce95b58..b013ac3 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -31,11 +31,14 @@ def _check_dataset_attributes(ds, expected_attributes, dataset_name): # check for attributes having the wrong value incorrect_attributes = { - k: v for k, v in expected_attributes.items() if ds.attrs[k] != v + key: val for key, val in expected_attributes.items() if ds.attrs[key] != val } if len(incorrect_attributes) > 0: s_list = "\n".join( - [f"{k}: {v} != {ds.attrs[k]}" for k, v in incorrect_attributes.items()] + [ + f"{key}: {val} != {ds.attrs[key]}" + for key, val in incorrect_attributes.items() + ] ) raise ValueError( f"Dataset {dataset_name} has the following incorrect attributes: {s_list}" diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 6ec212e..f4bc516 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -45,9 +45,9 @@ def derive_variables(fp, derived_variables, chunking): # Separate the lat,lon from the required variables as these will be derived separately latlon_coords_to_include = {} - for k, v in list(required_kwargs.items()): - if k in ["lat", "lon"]: - latlon_coords_to_include[k] = required_kwargs.pop(k) + for key in list(required_kwargs.keys()): + if key in ["lat", "lon"]: + latlon_coords_to_include[key] = required_kwargs.pop(key) # Subset the dataset ds_input = ds[required_kwargs.keys()] @@ -73,9 +73,9 @@ def derive_variables(fp, derived_variables, chunking): kwargs = {} if len(latlon_coords_to_include): latlon = get_latlon_coords_for_input(ds) - for k, v in latlon_coords_to_include.items(): - kwargs[v] = latlon[k] - kwargs.update({v: ds_input[k] for k, v in required_kwargs.items()}) + for key, val in latlon_coords_to_include.items(): + kwargs[val] 
= latlon[key] + kwargs.update({val: ds_input[key] for key, val in required_kwargs.items()}) func = _get_derived_variable_function(function_name) # Calculate the derived variable derived_field = func(**kwargs) From 2ce53c7a485549ae54f09262eac8e59692f37629 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 13:35:02 +0000 Subject: [PATCH 36/68] Move '_check_dataset_attributes' outside if statement --- mllam_data_prep/create_dataset.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index b013ac3..f38b619 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -148,11 +148,6 @@ def create_dataset(config: Config): raise Exception( f"Error loading dataset {dataset_name} from {path}" ) from ex - _check_dataset_attributes( - ds=ds, - expected_attributes=expected_input_attributes, - dataset_name=dataset_name, - ) if derived_variables: logger.info( @@ -169,11 +164,11 @@ def create_dataset(config: Config): f"Error loading dataset {dataset_name} from {path}" f" or deriving variables '{', '.join(list(derived_variables.keys()))}'." 
) from ex - _check_dataset_attributes( - ds=ds, - expected_attributes=expected_input_attributes, - dataset_name=dataset_name, - ) + _check_dataset_attributes( + ds=ds, + expected_attributes=expected_input_attributes, + dataset_name=dataset_name, + ) dim_mapping = input_config.dim_mapping From f1e3d778cbe034203c4b455278d65a11215af434 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 12 Dec 2024 13:49:59 +0000 Subject: [PATCH 37/68] Set '{}' as default for 'attributes' and 'chunking' --- mllam_data_prep/config.py | 6 +++--- mllam_data_prep/create_dataset.py | 4 ++-- mllam_data_prep/derived_variables.py | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index f20c3b2..9bbc783 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -67,7 +67,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str - attributes: Dict[str, str] = None + attributes: Dict[str, str] = field(default_factory=dict) @dataclass @@ -184,7 +184,7 @@ class InputDataset: target_output_variable: str variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] = None derived_variables: Dict[str, DerivedVariable] = None - attributes: Dict[str, Any] = None + attributes: Dict[str, Any] = field(default_factory=dict) @dataclass @@ -284,7 +284,7 @@ class Output: variables: Dict[str, List[str]] coord_ranges: Dict[str, Range] = None - chunking: Dict[str, int] = None + chunking: Dict[str, int] = field(default_factory=dict) splitting: Splitting = None diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index f38b619..113b703 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -124,7 +124,7 @@ def create_dataset(config: Config): output_config = config.output output_coord_ranges = output_config.coord_ranges - chunking_config = config.output.chunking or {} + chunking_config = config.output.chunking dataarrays_by_target = defaultdict(list) @@ 
-133,7 +133,7 @@ def create_dataset(config: Config): variables = input_config.variables derived_variables = input_config.derived_variables target_output_var = input_config.target_output_variable - expected_input_attributes = input_config.attributes or {} + expected_input_attributes = input_config.attributes expected_input_var_dims = input_config.dims output_dims = output_config.variables[target_output_var] diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index f4bc516..f881bcb 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -41,7 +41,7 @@ def derive_variables(fp, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function - derived_variable_attributes = derived_variable.attributes or {} + derived_variable_attributes = derived_variable.attributes # Separate the lat,lon from the required variables as these will be derived separately latlon_coords_to_include = {} From 2afbb356bde396448dd4273de92a1837c83758c3 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 07:32:28 +0000 Subject: [PATCH 38/68] Make types more explicit --- mllam_data_prep/derived_variables.py | 69 ++++++++++++++++++---------- 1 file changed, 44 insertions(+), 25 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index f881bcb..4669a36 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -16,17 +16,17 @@ def derive_variables(fp, derived_variables, chunking): fp : str Filepath to the source dataset, for example the path to a zarr dataset or a netCDF file (anything that is supported by `xarray.open_dataset` will work) - derived_variables : dict + derived_variables : Dict[str, DerivedVariable] Dictionary with the variables to derive with keys as the variable names and values with entries for kwargs and 
function to use in the calculation - chunking: dict + chunking: Dict[str, int] Dictionary with keys as the dimensions to chunk along and values with the chunk size Returns ------- - ds : xr.Dataset + xr.Dataset Dataset with derived variables included """ logger.info("Deriving variables") @@ -113,7 +113,7 @@ def _chunk_dataset(ds, chunks): ---------- ds: xr.Dataset Dataset to be chunked - chunks: dict + chunks: Dict[str, int] Dictionary with keys as dimensions to be chunked and chunk sizes as the values @@ -212,15 +212,15 @@ def _check_field(derived_field, derived_field_attributes): Parameters ---------- - derived_field: xr.DataArray or tuple + derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived variable - derived_field_attributes: dict + derived_field_attributes: Dict[str, str] Dictionary with attributes for the derived variables. Defined in the config file. Returns ------- - derived_field: xr.DataArray or tuple + derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived field """ if isinstance(derived_field, xr.DataArray): @@ -245,15 +245,15 @@ def _check_attributes(field, field_attributes): Parameters ---------- - field: xr.DataArray or tuple + field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived field - field_attributes: dict + field_attributes: Dict[str, str] Dictionary with attributes for the derived variables. Defined in the config file. Returns ------- - field: xr.DataArray or tuple + field: Union[xr.DataArray, Tuple[xr.DataArray]] The derived field """ for attribute in ["units", "long_name"]: @@ -289,7 +289,26 @@ def _check_attributes(field, field_attributes): def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunks): - """Return the coordinates that have been reset.""" + """ + Return the coordinates that have been reset. 
+ + Parameters + ---------- + ds_subset: xr.Dataset + Subsetted dataset with derived variables + ds_input: xr.Dataset + Input dataset for deriving variables + required_coordinates: List[str] + List of coordinates required for the derived variable + chunks: Dict[str, int] + Dictionary with keys as dimensions to be chunked and + chunk sizes as the values + + Returns + ------- + ds_subset: xr.Dataset + Subsetted dataset with dropped coordinates returned + """ for req_coord in required_coordinates: if req_coord in chunks: ds_subset.coords[req_coord] = ds_input[req_coord] @@ -303,16 +322,16 @@ def calculate_toa_radiation(lat, lon, time): Parameters ---------- - lat : xr.DataArray or float + lat : Union[xr.DataArray, float] Latitude values. Should be in the range [-90, 90] - lon : xr.DataArray or float + lon : Union[xr.DataArray, float] Longitude values. Should be in the range [-180, 180] or [0, 360] - time : xr.DataArray or datetime object + time : Union[xr.DataArray, datetime.datetime] Time Returns ------- - toa_radiation: xr.DataArray or float + toa_radiation : Union[xr.DataArray, float] TOA radiation data """ logger.info("Calculating top-of-atmosphere radiation") @@ -365,14 +384,14 @@ def calculate_hour_of_day(time): Parameters ---------- - time : xr.DataArray or datetime object + time : Union[xr.DataArray, datetime.datetime] Time Returns ------- - hour_of_day_cos: xr.DataArray or float + hour_of_day_cos: Union[xr.DataArray, float] cosine of the hour of day - hour_of_day_sin: xr.DataArray or float + hour_of_day_sin: Union[xr.DataArray, float] sine of the hour of day """ logger.info("Calculating hour of day") @@ -416,14 +435,14 @@ def calculate_day_of_year(time): Parameters ---------- - time : xr.DataArray or datetime object + time : Union[xr.DataArray, datetime.datetime] Time Returns ------- - day_of_year_cos: xr.DataArray or float + day_of_year_cos: Union[xr.DataArray, float] cosine of the day of year - day_of_year_sin: xr.DataArray or float + day_of_year_sin: 
Union[xr.DataArray, float] sine of the day of year """ logger.info("Calculating day of year") @@ -467,16 +486,16 @@ def cyclic_encoding(data, data_max): Parameters ---------- - data : xr.DataArray, float, or int + data : Union[xr.DataArray, float, int] Data that should be cyclically encoded - data_max: int or float + data_max: Union[int, float] Maximum possible value of input data. Should be greater than 0. Returns ------- - data_cos: xr.DataArray, float, or int + data_cos: Union[xr.DataArray, float, int] Cosine part of cyclically encoded input data - data_sin: xr.DataArray, float, or int + data_sin: Union[xr.DataArray, float, int] Sine part of cyclically encoded input data """ From 75797a206b8be9da6aa7b5376852ac6744524050 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 07:40:26 +0000 Subject: [PATCH 39/68] Rename 'ds_subset' to 'ds_derived_vars' and update comment for 'ds_input' --- mllam_data_prep/derived_variables.py | 34 +++++++++++++++------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 4669a36..a1898c1 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -36,8 +36,8 @@ def derive_variables(fp, derived_variables, chunking): except ValueError: ds = xr.open_dataset(fp) - ds_subset = xr.Dataset() - ds_subset.attrs.update(ds.attrs) + ds_derived_vars = xr.Dataset() + ds_derived_vars.attrs.update(ds.attrs) for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function @@ -49,7 +49,7 @@ def derive_variables(fp, derived_variables, chunking): if key in ["lat", "lon"]: latlon_coords_to_include[key] = required_kwargs.pop(key) - # Subset the dataset + # Get input dataset for calculating derived variables ds_input = ds[required_kwargs.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, @@ -83,14 
+83,14 @@ def derive_variables(fp, derived_variables, chunking): # Check the derived field(s) derived_field = _check_field(derived_field, derived_variable_attributes) - # Add the derived field(s) to the subset + # Add the derived field(s) to the dataset if isinstance(derived_field, xr.DataArray): - ds_subset[derived_field.name] = derived_field + ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: - ds_subset[field.name] = field + ds_derived_vars[field.name] = field else: raise TypeError( "Expected an instance of xr.DataArray or tuple(xr.DataArray)," @@ -98,11 +98,11 @@ def derive_variables(fp, derived_variables, chunking): ) # Add back dropped coordinates - ds_subset = _return_dropped_coordinates( - ds_subset, ds_input, required_coordinates, chunks + ds_derived_vars = _return_dropped_coordinates( + ds_derived_vars, ds_input, required_coordinates, chunks ) - return ds_subset + return ds_derived_vars def _chunk_dataset(ds, chunks): @@ -288,14 +288,16 @@ def _check_attributes(field, field_attributes): return field -def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunks): +def _return_dropped_coordinates( + ds_derived_vars, ds_input, required_coordinates, chunks +): """ Return the coordinates that have been reset. 
Parameters ---------- - ds_subset: xr.Dataset - Subsetted dataset with derived variables + ds_derived_vars: xr.Dataset + Dataset with derived variables ds_input: xr.Dataset Input dataset for deriving variables required_coordinates: List[str] @@ -306,14 +308,14 @@ def _return_dropped_coordinates(ds_subset, ds_input, required_coordinates, chunk Returns ------- - ds_subset: xr.Dataset - Subsetted dataset with dropped coordinates returned + ds_derived_vars: xr.Dataset + Dataset with derived variables, now also with dropped coordinates returned """ for req_coord in required_coordinates: if req_coord in chunks: - ds_subset.coords[req_coord] = ds_input[req_coord] + ds_derived_vars.coords[req_coord] = ds_input[req_coord] - return ds_subset + return ds_derived_vars def calculate_toa_radiation(lat, lon, time): From 31578e81abc44725a38080b900fb9055e188274e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 07:56:58 +0000 Subject: [PATCH 40/68] Add 'Optional[...]' to optional attributes --- mllam_data_prep/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 9bbc783..0029313 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -67,7 +67,7 @@ class DerivedVariable: kwargs: Dict[str, str] function: str - attributes: Dict[str, str] = field(default_factory=dict) + attributes: Optional[Dict[str, str]] = field(default_factory=dict) @dataclass @@ -182,9 +182,9 @@ class InputDataset: dims: List[str] dim_mapping: Dict[str, DimMapping] target_output_variable: str - variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] = None - derived_variables: Dict[str, DerivedVariable] = None - attributes: Dict[str, Any] = field(default_factory=dict) + variables: Optional[Union[List[str], Dict[str, Dict[str, ValueSelection]]]] = None + derived_variables: Optional[Dict[str, DerivedVariable]] = None + attributes: Optional[Dict[str, Any]] = field(default_factory=dict) 
@dataclass From 90e4cf2cb2046280b646975ead8ff9e12b7e10e6 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 08:39:02 +0000 Subject: [PATCH 41/68] Move loading of dataset to a separate function --- mllam_data_prep/create_dataset.py | 27 +++++++++++++--------- mllam_data_prep/derived_variables.py | 19 +++++----------- mllam_data_prep/ops/loading.py | 34 +++++++++++++++++++++------- 3 files changed, 48 insertions(+), 32 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 113b703..181292a 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -11,7 +11,7 @@ from . import __version__ from .config import Config, InvalidConfigException from .derived_variables import derive_variables -from .ops.loading import load_and_subset_dataset +from .ops.loading import load_dataset, subset_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs from .ops.statistics import calc_stats @@ -138,32 +138,37 @@ def create_dataset(config: Config): output_dims = output_config.variables[target_output_var] + logger.info(f"Loading dataset {dataset_name} from {path}") + try: + ds_source = load_dataset(fp=path) + except Exception as ex: + raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex + if variables: - logger.info(f"Loading dataset {dataset_name} from {path} and subsetting") + logger.info(f"Subsetting dataset {dataset_name}") try: - ds = load_and_subset_dataset( - fp=path, variables=variables, chunking=chunking_config + ds = subset_dataset( + ds=ds_source, variables=variables, chunking=chunking_config ) except Exception as ex: raise Exception( - f"Error loading dataset {dataset_name} from {path}" + f"Error subsetting dataset {dataset_name} from {path}" ) from ex if derived_variables: - logger.info( - f"Loading dataset {dataset_name} from {path} and deriving variables" - ) + logger.info(f"Deriving variables from {dataset_name}") 
try: ds = derive_variables( - fp=path, + ds=ds_source, derived_variables=derived_variables, chunking=chunking_config, ) except Exception as ex: raise Exception( - f"Error loading dataset {dataset_name} from {path}" - f" or deriving variables '{', '.join(list(derived_variables.keys()))}'." + f"Error deriving variables '{', '.join(list(derived_variables.keys()))}'" + f" from dataset {dataset_name} from {path}" ) from ex + _check_dataset_attributes( ds=ds, expected_attributes=expected_input_attributes, diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index a1898c1..8091e64 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -7,19 +7,18 @@ from loguru import logger -def derive_variables(fp, derived_variables, chunking): +def derive_variables(ds, derived_variables, chunking): """ Load the dataset, and derive the specified variables Parameters --------- - fp : str - Filepath to the source dataset, for example the path to a zarr dataset - or a netCDF file (anything that is supported by `xarray.open_dataset` will work) + ds : xr.Dataset + Source dataset derived_variables : Dict[str, DerivedVariable] - Dictionary with the variables to derive - with keys as the variable names and values with entries for - kwargs and function to use in the calculation + Dictionary with the variables to derive with keys as the variable + names and values with entries for kwargs and function to use in + the calculation chunking: Dict[str, int] Dictionary with keys as the dimensions to chunk along and values with the chunk size @@ -29,12 +28,6 @@ def derive_variables(fp, derived_variables, chunking): xr.Dataset Dataset with derived variables included """ - logger.info("Deriving variables") - - try: - ds = xr.open_zarr(fp) - except ValueError: - ds = xr.open_dataset(fp) ds_derived_vars = xr.Dataset() ds_derived_vars.attrs.update(ds.attrs) diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py 
index 5275c57..a8c2d24 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -1,16 +1,39 @@ import xarray as xr -def load_and_subset_dataset(fp, variables, chunking): +def load_dataset(fp): """ - Load the dataset, subset the variables along the specified coordinates and - check coordinate units + Load the dataset Parameters ---------- fp : str Filepath to the source dataset, for example the path to a zarr dataset or a netCDF file (anything that is supported by `xarray.open_dataset` will work) + + Returns + ------- + ds: xr.Dataset + Source dataset + """ + + try: + ds = xr.open_zarr(fp) + except ValueError: + ds = xr.open_dataset(fp) + + return ds + + +def subset_dataset(ds, variables, chunking): + """ + Load the dataset, subset the variables along the specified coordinates and + check coordinate units + + Parameters + ---------- + ds : xr.Dataset + Source dataset variables : dict Dictionary with the variables to subset with keys as the variable names and values with entries for each @@ -20,11 +43,6 @@ def load_and_subset_dataset(fp, variables, chunking): with the chunk size """ - try: - ds = xr.open_zarr(fp) - except ValueError: - ds = xr.open_dataset(fp) - ds_subset = xr.Dataset() ds_subset.attrs.update(ds.attrs) if isinstance(variables, dict): From 717c6a526215e1961d93626d5298b755427ae2e9 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 08:46:05 +0000 Subject: [PATCH 42/68] Simplify if loops --- mllam_data_prep/derived_variables.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 8091e64..cce450e 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -264,19 +264,18 @@ def _check_attributes(field, field_attributes): " of the config file either. Make sure that you add it to the" f" 'attributes' section of the derived variable '{field.name}'." 
) + elif attribute in field_attributes.keys(): + logger.warning( + f"The attribute '{attribute}' of the derived field" + f" {field.name} is being overwritten from" + f" '{field.attrs[attribute]}' to" + f" '{field_attributes[attribute]}' according" + " to specification in the config file." + ) + field.attrs[attribute] = field_attributes[attribute] else: - if attribute in field_attributes.keys(): - logger.warning( - f"The attribute '{attribute}' of the derived field" - f" {field.name} is being overwritten from" - f" '{field.attrs[attribute]}' to" - f" '{field_attributes[attribute]}' according" - " to specification in the config file." - ) - field.attrs[attribute] = field_attributes[attribute] - else: - # Attributes are set and nothing has been defined in the config file - pass + # Attributes are set and nothing has been defined in the config file + pass return field From 2856c6b9acf932e8b949be0e9dcc1e7de8ef7c9a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 09:24:16 +0000 Subject: [PATCH 43/68] Update '_get_derived_variable_function' --- mllam_data_prep/derived_variables.py | 42 +++++++++++++++------------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index cce450e..4f64509 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -166,27 +166,20 @@ def _get_derived_variable_function(function_namespace): # Get the name of the calling module calling_module = globals()["__name__"] - if "." 
in function_namespace: - # If the function name is a full namespace, get module and function names - module_name, function_name = function_namespace.rsplit(".", 1) - - # Check if the module_name is pointing to here (the calling module), - # and if it does then use globals() to get the function otherwise - # import the correct module and get the correct function - if module_name == calling_module: - function = globals().get(function_name) - else: - # Check if the module is already imported - if module_name in sys.modules: - module = module_name - else: - module = importlib.import_module(module_name) - - # Get the function from the module - function = getattr(module, function_name) + # Get module and function names + function_namespace_list = function_namespace.rsplit(".") + if len(function_namespace_list) > 1: + function_name = function_namespace_list[-1] + module_name = ".".join(elem for elem in function_namespace_list[:-1]) else: - # If function name only get it from the calling module (here) - function = globals().get(function_namespace) + module_name = "" + function_name = function_namespace_list[0] + + # Check if the module_name is pointing to here (the calling module or empty "") + # If it does, then use globals() to get the function otherwise import the + # correct module and get the correct function + if module_name in [calling_module, ""]: + function = globals().get(function_name) if not function: raise TypeError( f"Function '{function_namespace}' was not found in '{calling_module}'." @@ -195,6 +188,15 @@ def _get_derived_variable_function(function_namespace): " want to use a function defined outside of of the current module" f" '{calling_module}'." 
) + else: + # Check if the module is already imported + if module_name in sys.modules: + module = module_name + else: + module = importlib.import_module(module_name) + + # Get the function from the module + function = getattr(module, function_name) return function From 98673ee227e2779a1ed2c9f5771dcce91524b7c7 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 09:46:24 +0000 Subject: [PATCH 44/68] Simplify checks of the derived fields --- mllam_data_prep/derived_variables.py | 42 ++++------------------------ 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 4f64509..586f412 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -73,16 +73,17 @@ def derive_variables(ds, derived_variables, chunking): # Calculate the derived variable derived_field = func(**kwargs) - # Check the derived field(s) - derived_field = _check_field(derived_field, derived_variable_attributes) - - # Add the derived field(s) to the dataset + # Check the derived field(s) and add it to the dataset if isinstance(derived_field, xr.DataArray): + derived_field = _check_attributes( + derived_field, derived_variable_attributes + ) ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: + field = _check_attributes(field, derived_variable_attributes) ds_derived_vars[field.name] = field else: raise TypeError( @@ -201,39 +202,6 @@ def _get_derived_variable_function(function_namespace): return function -def _check_field(derived_field, derived_field_attributes): - """ - Check the derived field. - - Parameters - ---------- - derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] - The derived variable - derived_field_attributes: Dict[str, str] - Dictionary with attributes for the derived variables. - Defined in the config file. 
- - Returns - ------- - derived_field: Union[xr.DataArray, Tuple[xr.DataArray]] - The derived field - """ - if isinstance(derived_field, xr.DataArray): - derived_field = _check_attributes(derived_field, derived_field_attributes) - elif isinstance(derived_field, tuple) and all( - isinstance(field, xr.DataArray) for field in derived_field - ): - for field in derived_field: - field = _check_attributes(field, derived_field_attributes) - else: - raise TypeError( - "Expected an instance of xr.DataArray or tuple(xr.DataArray)," - f" but got {type(derived_field)}." - ) - - return derived_field - - def _check_attributes(field, field_attributes): """ Check the attributes of the derived variable. From 8940e82c9383486af88e5c14d16f2457c5f2c50f Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 09:53:06 +0000 Subject: [PATCH 45/68] Issue warning saying that we assume coordinates are named 'lat' and 'lon' --- mllam_data_prep/derived_variables.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 586f412..1b275fd 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -37,6 +37,10 @@ def derive_variables(ds, derived_variables, chunking): derived_variable_attributes = derived_variable.attributes # Separate the lat,lon from the required variables as these will be derived separately + logger.warning( + "Assuming that the lat/lon coordinates are given as variables called" + " 'lat' and 'lon'." 
+ ) latlon_coords_to_include = {} for key in list(required_kwargs.keys()): if key in ["lat", "lon"]: From e12e328534bd6ae81b4300ac15a7a8a91a2cfa8a Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 12:11:56 +0000 Subject: [PATCH 46/68] Update README to make it clear that 'attributes' is associated with 'derived_variables' --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 7d9f947..de089dc 100644 --- a/README.md +++ b/README.md @@ -371,10 +371,10 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `rename`: simply rename the dimension to the new name - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. -- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with additional information. -- `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. -- `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. 
-- `attributes`: section where users can specify the attributes `units` and `long_name` as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. +- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with the following additional information. + - `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. + - `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. + - `attributes`: section where users can specify attributes (e.g. `units` and `long_name`) as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. 
### Config schema versioning From ecdea30e323e85f893a4e446ea8eddbb946c2a4b Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 12:17:45 +0000 Subject: [PATCH 47/68] Indicate that 'variables' and 'derived_variables' are mutually exclusive --- mllam_data_prep/config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 0029313..1190088 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -138,7 +138,7 @@ class InputDataset: 1) the path to the dataset, 2) the expected dimensions of the dataset, 3) the variables to select from the dataset (and optionally subsection - along the coordinates for each variable) and/or the variables to derive + along the coordinates for each variable) or the variables to derive from the dataset, and finally 4) the method by which the dimensions and variables of the dataset are mapped to one of the output variables (this includes stacking of all From e3c0f223575b16cc9aac94b82c87600a7c885c7c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 12:31:18 +0000 Subject: [PATCH 48/68] Update docstring of 'InputDataset' class --- mllam_data_prep/config.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 1190088..2bc42a9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -153,11 +153,6 @@ class InputDataset: dims: List[str] List of the expected dimensions of the dataset. E.g. `["time", "x", "y"]`. These will be checked to ensure consistency of the dataset being read. - variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] - List of the variables to select from the dataset. E.g. `["temperature", "precipitation"]` - or a dictionary where the keys are the variable names and the values are dictionaries - defining the selection for each variable. E.g. 
`{"temperature": levels: {"values": [1000, 950, 900]}}` - would select the "temperature" variable and only the levels 1000, 950, and 900. dim_mapping: Dict[str, DimMapping] Mapping of the variables and dimensions in the input dataset to the dimensions of the output variable (`target_output_variable`). The key is the name of the output dimension to map to @@ -170,12 +165,19 @@ class InputDataset: (e.g. two datasets that coincide in space and time will only differ in the feature dimension, so the two will be combined by concatenating along the feature dimension). If a single shared coordinate cannot be found then an exception will be raised. + variables: Union[List[str], Dict[str, Dict[str, ValueSelection]]] + List of the variables to select from the dataset. E.g. `["temperature", "precipitation"]` + or a dictionary where the keys are the variable names and the values are dictionaries + defining the selection for each variable. E.g. `{"temperature": levels: {"values": [1000, 950, 900]}}` + would select the "temperature" variable and only the levels 1000, 950, and 900. derived_variables: Dict[str, DerivedVariable] Dictionary of variables to derive from the dataset, where the keys are the variable names and the values are dictionaries defining the necessary function and kwargs. E.g. `{"toa_radiation": {"kwargs": {"time": "time", "lat": "lat", "lon": "lon"}, "function": "calculate_toa_radiation"}}` would derive the "toa_radiation" variable using the `calculate_toa_radiation` function, which takes `time`, `lat` and `lon` as arguments. + attributes: Dict[str, Any] + Optional dictionary with dataset attributes. 
""" path: str From e907a6ddedf523a08f31423453ee1e341f6a13cf Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 14:10:30 +0000 Subject: [PATCH 49/68] Correct types in '_check_attributes' docstring --- mllam_data_prep/derived_variables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 1b275fd..f693f64 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -212,7 +212,7 @@ def _check_attributes(field, field_attributes): Parameters ---------- - field: Union[xr.DataArray, Tuple[xr.DataArray]] + field: xr.DataArray The derived field field_attributes: Dict[str, str] Dictionary with attributes for the derived variables. @@ -220,7 +220,7 @@ def _check_attributes(field, field_attributes): Returns ------- - field: Union[xr.DataArray, Tuple[xr.DataArray]] + field: xr.DataArray The derived field """ for attribute in ["units", "long_name"]: From bb9be1375bcce3cbdc0ef658efee8660efbe788c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 14:15:24 +0000 Subject: [PATCH 50/68] Use 'rpartition' to get 'module_name' and 'function_name' --- mllam_data_prep/derived_variables.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index f693f64..4861324 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -172,13 +172,7 @@ def _get_derived_variable_function(function_namespace): calling_module = globals()["__name__"] # Get module and function names - function_namespace_list = function_namespace.rsplit(".") - if len(function_namespace_list) > 1: - function_name = function_namespace_list[-1] - module_name = ".".join(elem for elem in function_namespace_list[:-1]) - else: - module_name = "" - function_name = function_namespace_list[0] + module_name, _, function_name = 
function_namespace.rpartition(".") # Check if the module_name is pointing to here (the calling module or empty "") # If it does, then use globals() to get the function otherwise import the From 49de0b3a81f261c8d87dea27c314c74290d0bb86 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 13 Dec 2024 14:23:57 +0000 Subject: [PATCH 51/68] Add some initial tests for 'derived_variables' --- tests/test_derived_variables.py | 117 ++++++++++++++++++++++++++++++++ 1 file changed, 117 insertions(+) create mode 100644 tests/test_derived_variables.py diff --git a/tests/test_derived_variables.py b/tests/test_derived_variables.py new file mode 100644 index 0000000..70a9810 --- /dev/null +++ b/tests/test_derived_variables.py @@ -0,0 +1,117 @@ +import datetime +import random +from unittest.mock import patch + +import isodate +import numpy as np +import pandas as pd +import pytest +import xarray as xr + +import mllam_data_prep as mdp + +NCOORD = 10 +NTIME = 10 +LAT_MIN = -90 +LAT_MAX = 90 +LON_MIN = 0 +LON_MAX = 360 +LATITUDE = [ + 55.711, + xr.DataArray( + np.random.uniform(LAT_MIN, LAT_MAX, size=(NCOORD, NCOORD)), + dims=["x", "y"], + coords={"x": np.arange(NCOORD), "y": np.arange(NCOORD)}, + name="lat", + ), +] +LONGITUDE = [ + 12.564, + xr.DataArray( + np.random.uniform(LON_MIN, LON_MAX, size=(NCOORD, NCOORD)), + dims=["x", "y"], + coords={"x": np.arange(NCOORD), "y": np.arange(NCOORD)}, + name="lon", + ), +] +TIME = [ + np.datetime64("2004-06-11T00:00:00"), # invalid type + isodate.parse_datetime("1999-03-21T00:00"), + xr.DataArray( + pd.date_range( + start=isodate.parse_datetime("1999-03-21T00:00"), + periods=NTIME, + freq=isodate.parse_duration("PT1H"), + ), + dims=["time"], + name="time", + ), +] + + +def mock_cyclic_encoding(data, data_max): + """Mock the `cyclic_encoding` function from mllam_data_prep.derived_variables.""" + if isinstance(data, xr.DataArray): + data_cos = xr.DataArray( + random.uniform(-1, 1), + coords=data.coords, + dims=data.dims, + ) + data_sin = 
xr.DataArray( + random.uniform(-1, 1), + coords=data.coords, + dims=data.dims, + ) + return data_cos, data_sin + elif isinstance(data, (float, int)): + return random.uniform(-1, 1), random.uniform(-1, 1) + + +@pytest.mark.parametrize("lat", LATITUDE) +@pytest.mark.parametrize("lon", LONGITUDE) +@pytest.mark.parametrize("time", TIME) +def test_toa_radiation(lat, lon, time): + """ + Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables + """ + with patch( + "mllam_data_prep.derived_variables.cyclic_encoding", + side_effect=mock_cyclic_encoding, + ): + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + else: + with pytest.raises(TypeError): + mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + + +@pytest.mark.parametrize("time", TIME) +def test_hour_of_day(time): + """ + Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables + """ + with patch( + "mllam_data_prep.derived_variables.cyclic_encoding", + side_effect=mock_cyclic_encoding, + ): + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.derived_variables.calculate_hour_of_day(time) + else: + with pytest.raises(TypeError): + mdp.derived_variables.calculate_hour_of_day(time) + + +@pytest.mark.parametrize("time", TIME) +def test_day_of_year(time): + """ + Test the `calculate_day_of_year` function from mllam_data_prep.derived_variables + """ + with patch( + "mllam_data_prep.derived_variables.cyclic_encoding", + side_effect=mock_cyclic_encoding, + ): + if isinstance(time, (xr.DataArray, datetime.datetime)): + mdp.derived_variables.calculate_day_of_year(time) + else: + with pytest.raises(TypeError): + mdp.derived_variables.calculate_day_of_year(time) From b268f01b7e099eafaaf817caf86010b5c5ce70c0 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:22:47 +0000 Subject: [PATCH 52/68] Update docstrings and rename 'DerivedVariable.attributes' to 
'DerivedVariable.attrs' --- mllam_data_prep/config.py | 17 +++++++++-------- mllam_data_prep/derived_variables.py | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index 2bc42a9..bfd20e9 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -57,17 +57,21 @@ class DerivedVariable: """ Defines a derived variables, where the kwargs (variables required for the calculation) and the function (for calculating the variable) - are specified. + are specified. Optionally, in case a function does not return an + `xr.DataArray` with the required attributes (`units` and `long_name`) set, + these should be specified in `attrs`, e.g. + {"attrs": {"units": "W*m**-2", "long_name": "top-of-the-atmosphere radiation"}}. + Additional attributes can also be set if desired. Attributes: kwargs: Variables required for calculating the derived variable. function: Function used to calculate the derived variable. - attributes: Attributes (e.g. `units` and `long_name`) for the derived variable. + attrs: Attributes (e.g. `units` and `long_name`) to set for the derived variable. """ kwargs: Dict[str, str] function: str - attributes: Optional[Dict[str, str]] = field(default_factory=dict) + attrs: Optional[Dict[str, str]] = field(default_factory=dict) @dataclass @@ -171,11 +175,8 @@ class InputDataset: defining the selection for each variable. E.g. `{"temperature": levels: {"values": [1000, 950, 900]}}` would select the "temperature" variable and only the levels 1000, 950, and 900. derived_variables: Dict[str, DerivedVariable] - Dictionary of variables to derive from the dataset, where the keys are the variable names and - the values are dictionaries defining the necessary function and kwargs. E.g.
- `{"toa_radiation": {"kwargs": {"time": "time", "lat": "lat", "lon": "lon"}, "function": "calculate_toa_radiation"}}` - would derive the "toa_radiation" variable using the `calculate_toa_radiation` function, which - takes `time`, `lat` and `lon` as arguments. + Dictionary of variables to derive from the dataset, where the keys are the names variables will be given and + the values are `DerivedVariable` definitions that specify how to derive a variable. attributes: Dict[str, Any] Optional dictionary with dataset attributes. """ diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/derived_variables.py index 4861324..80d6cae 100644 --- a/mllam_data_prep/derived_variables.py +++ b/mllam_data_prep/derived_variables.py @@ -34,7 +34,7 @@ def derive_variables(ds, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function - derived_variable_attributes = derived_variable.attributes + derived_variable_attributes = derived_variable.attrs # Separate the lat,lon from the required variables as these will be derived separately logger.warning( From dbd5bfd5fbd5f3473e58ac9cc477478ed78f8c7b Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:29:19 +0000 Subject: [PATCH 53/68] Do not add 'attributes' to docstring --- mllam_data_prep/config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index bfd20e9..f114f60 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -177,8 +177,6 @@ class InputDataset: derived_variables: Dict[str, DerivedVariable] Dictionary of variables to derive from the dataset, where the keys are the names variables will be given and the values are `DerivedVariable` definitions that specify how to derive a variable. - attributes: Dict[str, Any] - Optional dictionary with dataset attributes. 
""" path: str From 474a83db1ac56828f66701649bbc8c70a1d4b1ee Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:31:46 +0000 Subject: [PATCH 54/68] Remove unnecessary exception handling --- mllam_data_prep/create_dataset.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 181292a..bd53cd2 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -146,28 +146,17 @@ def create_dataset(config: Config): if variables: logger.info(f"Subsetting dataset {dataset_name}") - try: - ds = subset_dataset( - ds=ds_source, variables=variables, chunking=chunking_config - ) - except Exception as ex: - raise Exception( - f"Error subsetting dataset {dataset_name} from {path}" - ) from ex + ds = subset_dataset( + ds=ds_source, variables=variables, chunking=chunking_config + ) if derived_variables: logger.info(f"Deriving variables from {dataset_name}") - try: - ds = derive_variables( - ds=ds_source, - derived_variables=derived_variables, - chunking=chunking_config, - ) - except Exception as ex: - raise Exception( - f"Error deriving variables '{', '.join(list(derived_variables.keys()))}'" - f" from dataset {dataset_name} from {path}" - ) from ex + ds = derive_variables( + ds=ds_source, + derived_variables=derived_variables, + chunking=chunking_config, + ) _check_dataset_attributes( ds=ds, From 1da66e2d3b9a6e572be31ddc16a365963940d636 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:44:03 +0000 Subject: [PATCH 55/68] Move 'subset_dataset' to 'ops.subsetting' --- mllam_data_prep/create_dataset.py | 9 +++-- mllam_data_prep/ops/loading.py | 61 +------------------------------ mllam_data_prep/ops/subsetting.py | 60 ++++++++++++++++++++++++++++++ 3 files changed, 66 insertions(+), 64 deletions(-) create mode 100644 mllam_data_prep/ops/subsetting.py diff --git a/mllam_data_prep/create_dataset.py 
b/mllam_data_prep/create_dataset.py index bd53cd2..a034eaa 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -11,10 +11,11 @@ from . import __version__ from .config import Config, InvalidConfigException from .derived_variables import derive_variables -from .ops.loading import load_dataset, subset_dataset +from .ops.loading import load_input_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs from .ops.statistics import calc_stats +from .ops.subsetting import subset_dataset # the `extra` field in the config that was added between v0.2.0 and v0.5.0 is # optional, so we can support both v0.2.0 and v0.5.0 @@ -140,20 +141,20 @@ def create_dataset(config: Config): logger.info(f"Loading dataset {dataset_name} from {path}") try: - ds_source = load_dataset(fp=path) + ds_input = load_input_dataset(fp=path) except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex if variables: logger.info(f"Subsetting dataset {dataset_name}") ds = subset_dataset( - ds=ds_source, variables=variables, chunking=chunking_config + ds=ds_input, variables=variables, chunking=chunking_config ) if derived_variables: logger.info(f"Deriving variables from {dataset_name}") ds = derive_variables( - ds=ds_source, + ds=ds_input, derived_variables=derived_variables, chunking=chunking_config, ) diff --git a/mllam_data_prep/ops/loading.py b/mllam_data_prep/ops/loading.py index a8c2d24..f6bfc34 100644 --- a/mllam_data_prep/ops/loading.py +++ b/mllam_data_prep/ops/loading.py @@ -1,7 +1,7 @@ import xarray as xr -def load_dataset(fp): +def load_input_dataset(fp): """ Load the dataset @@ -23,62 +23,3 @@ def load_dataset(fp): ds = xr.open_dataset(fp) return ds - - -def subset_dataset(ds, variables, chunking): - """ - Load the dataset, subset the variables along the specified coordinates and - check coordinate units - - Parameters - ---------- - ds : xr.Dataset - Source dataset - 
variables : dict - Dictionary with the variables to subset - with keys as the variable names and values with entries for each - coordinate and coordinate values to extract - chunking: dict - Dictionary with keys as the dimensions to chunk along and values - with the chunk size - """ - - ds_subset = xr.Dataset() - ds_subset.attrs.update(ds.attrs) - if isinstance(variables, dict): - for var, coords_to_sample in variables.items(): - da = ds[var] - for coord, sampling in coords_to_sample.items(): - coord_values = sampling.values - try: - da = da.sel(**{coord: coord_values}) - except KeyError as ex: - raise KeyError( - f"Could not find the all coordinate values `{coord_values}` in " - f"coordinate `{coord}` in the dataset" - ) from ex - expected_units = sampling.units - coord_units = da[coord].attrs.get("units", None) - if coord_units is not None and coord_units != expected_units: - raise ValueError( - f"Expected units {expected_units} for coordinate {coord}" - f" in variable {var} but got {coord_units}" - ) - ds_subset[var] = da - elif isinstance(variables, list): - try: - ds_subset = ds[variables] - except KeyError as ex: - raise KeyError( - f"Could not find the all variables `{variables}` in the dataset. 
" - f"The available variables are {list(ds.data_vars)}" - ) from ex - else: - raise ValueError("The `variables` argument should be a list or a dictionary") - - chunks = { - dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims - } - ds_subset = ds_subset.chunk(chunks) - - return ds_subset diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py new file mode 100644 index 0000000..8cfa8ca --- /dev/null +++ b/mllam_data_prep/ops/subsetting.py @@ -0,0 +1,60 @@ +import xarray as xr + + +def subset_dataset(ds, variables, chunking): + """ + Select specific variables from the provided the dataset, subset the + variables along the specified coordinates and check coordinate units + + Parameters + ---------- + ds : xr.Dataset + Source dataset + variables : dict + Dictionary with the variables to subset + with keys as the variable names and values with entries for each + coordinate and coordinate values to extract + chunking: dict + Dictionary with keys as the dimensions to chunk along and values + with the chunk size + """ + + ds_subset = xr.Dataset() + ds_subset.attrs.update(ds.attrs) + if isinstance(variables, dict): + for var, coords_to_sample in variables.items(): + da = ds[var] + for coord, sampling in coords_to_sample.items(): + coord_values = sampling.values + try: + da = da.sel(**{coord: coord_values}) + except KeyError as ex: + raise KeyError( + f"Could not find the all coordinate values `{coord_values}` in " + f"coordinate `{coord}` in the dataset" + ) from ex + expected_units = sampling.units + coord_units = da[coord].attrs.get("units", None) + if coord_units is not None and coord_units != expected_units: + raise ValueError( + f"Expected units {expected_units} for coordinate {coord}" + f" in variable {var} but got {coord_units}" + ) + ds_subset[var] = da + elif isinstance(variables, list): + try: + ds_subset = ds[variables] + except KeyError as ex: + raise KeyError( + f"Could not find the all variables 
`{variables}` in the dataset. " + f"The available variables are {list(ds.data_vars)}" + ) from ex + else: + raise ValueError("The `variables` argument should be a list or a dictionary") + + chunks = { + dim: chunking.get(dim, int(ds_subset[dim].count())) for dim in ds_subset.dims + } + ds_subset = ds_subset.chunk(chunks) + + return ds_subset From dc7dc5e04ade8d63b6923cbc4d5b4b9303d760be Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:48:26 +0000 Subject: [PATCH 56/68] Move 'derived_variables' to 'ops' --- example.danra.yaml | 4 ++-- mllam_data_prep/create_dataset.py | 2 +- mllam_data_prep/{ => ops}/derived_variables.py | 0 3 files changed, 3 insertions(+), 3 deletions(-) rename mllam_data_prep/{ => ops}/derived_variables.py (100%) diff --git a/example.danra.yaml b/example.danra.yaml index f1fa443..d6a9468 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -82,11 +82,11 @@ inputs: time: time lat: lat lon: lon - function: mllam_data_prep.derived_variables.calculate_toa_radiation + function: mllam_data_prep.ops.derived_variables.calculate_toa_radiation hour_of_day: kwargs: time: time - function: mllam_data_prep.derived_variables.calculate_hour_of_day + function: mllam_data_prep.ops.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index a034eaa..19bf4df 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -10,7 +10,7 @@ from . 
import __version__ from .config import Config, InvalidConfigException -from .derived_variables import derive_variables +from .ops.derived_variables import derive_variables from .ops.loading import load_input_dataset from .ops.mapping import map_dims_and_variables from .ops.selection import select_by_kwargs diff --git a/mllam_data_prep/derived_variables.py b/mllam_data_prep/ops/derived_variables.py similarity index 100% rename from mllam_data_prep/derived_variables.py rename to mllam_data_prep/ops/derived_variables.py From c9e96af9388a2fa9d75be2c6367b24d6ba399f6c Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 13:59:02 +0000 Subject: [PATCH 57/68] Move chunk size check to 'chunking' module --- mllam_data_prep/ops/chunking.py | 44 ++++++++++++++++++++++++ mllam_data_prep/ops/derived_variables.py | 26 +++----------- 2 files changed, 48 insertions(+), 22 deletions(-) create mode 100644 mllam_data_prep/ops/chunking.py diff --git a/mllam_data_prep/ops/chunking.py b/mllam_data_prep/ops/chunking.py new file mode 100644 index 0000000..12731e1 --- /dev/null +++ b/mllam_data_prep/ops/chunking.py @@ -0,0 +1,44 @@ +import numpy as np +from loguru import logger + +# Max chunk size warning +CHUNK_MAX_SIZE_WARNING = 1 * 1024**3 # 1GB + + +def check_chunk_size(ds, chunks): + """ + Check the chunk size and warn if it exceeds CHUNK_MAX_SIZE_WARNING.
+ + Parameters + ---------- + ds: xr.Dataset + Dataset to be chunked + chunks: Dict[str, int] + Dictionary with keys as dimensions to be chunked and + chunk sizes as the values + + Returns + ------- + ds: xr.Dataset + Dataset with chunking applied + """ + + # Check the chunk size + for var_name, var_data in ds.data_vars.items(): + total_size = 1 + + for dim, chunk_size in chunks.items(): + dim_size = ds.sizes.get(dim, None) + if dim_size is None: + raise KeyError(f"Dimension '{dim}' not found in the dataset.") + total_size *= chunk_size + + dtype = var_data.dtype + bytes_per_element = np.dtype(dtype).itemsize + + memory_usage = total_size * bytes_per_element + + if memory_usage > CHUNK_MAX_SIZE_WARNING: + logger.warning( + f"The chunk size for '{var_name}' exceeds '{CHUNK_MAX_SIZE_WARNING}' GB." + ) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 80d6cae..f31865f 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -6,6 +6,8 @@ import xarray as xr from loguru import logger +from .chunking import check_chunk_size + def derive_variables(ds, derived_variables, chunking): """ @@ -105,7 +107,7 @@ def derive_variables(ds, derived_variables, chunking): def _chunk_dataset(ds, chunks): """ - Chunk dataset and check the chunk size. + Check the chunk size and chunk dataset. 
Parameters ---------- @@ -120,28 +122,8 @@ def _chunk_dataset(ds, chunks): ds: xr.Dataset Dataset with chunking applied """ - # Define the memory limit check - memory_limit_check = 1 * 1024**3 # 1 GB - # Check the chunk size - for var_name, var_data in ds.data_vars.items(): - total_size = 1 - - for dim, chunk_size in chunks.items(): - dim_size = ds.sizes.get(dim, None) - if dim_size is None: - raise KeyError(f"Dimension '{dim}' not found in the dataset.") - total_size *= chunk_size - - dtype = var_data.dtype - bytes_per_element = np.dtype(dtype).itemsize - - memory_usage = total_size * bytes_per_element - - if memory_usage > memory_limit_check: - logger.warning( - f"The chunk size for '{var_name}' exceeds '{memory_limit_check}' GB." - ) + check_chunk_size(ds, chunks) # Try chunking try: From 47b8411b0b23c026d0f64260384d81c2d5e2b700 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 14:46:28 +0000 Subject: [PATCH 58/68] Add module docstring --- mllam_data_prep/ops/derived_variables.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index f31865f..9ad495e 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -1,3 +1,11 @@ +""" +Handle deriving new variables (xr.DataArrays) from an individual input dataset +that has been loaded. This makes it possible to for example add fields that can +be derived from analytical expressions and are functions of coordinate values +(e.g. top-of-atmosphere incoming radiation is a function of time and lat/lon location), +but also of other physical fields (wind-speed is a function of both meridional +and zonal wind components). 
+""" import datetime import importlib import sys From 5ae772f736e773d5d1beef8db684e19e09366ea7 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Tue, 17 Dec 2024 14:54:07 +0000 Subject: [PATCH 59/68] Update tests --- tests/test_derived_variables.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_derived_variables.py b/tests/test_derived_variables.py index 70a9810..786e064 100644 --- a/tests/test_derived_variables.py +++ b/tests/test_derived_variables.py @@ -50,7 +50,7 @@ def mock_cyclic_encoding(data, data_max): - """Mock the `cyclic_encoding` function from mllam_data_prep.derived_variables.""" + """Mock the `cyclic_encoding` function from mllam_data_prep.ops.derived_variables.""" if isinstance(data, xr.DataArray): data_cos = xr.DataArray( random.uniform(-1, 1), @@ -75,14 +75,14 @@ def test_toa_radiation(lat, lon, time): Test the `calculate_toa_radiation` function from mllam_data_prep.derived_variables """ with patch( - "mllam_data_prep.derived_variables.cyclic_encoding", + "mllam_data_prep.ops.derived_variables.cyclic_encoding", side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + mdp.ops.derived_variables.calculate_toa_radiation(lat, lon, time) else: with pytest.raises(TypeError): - mdp.derived_variables.calculate_toa_radiation(lat, lon, time) + mdp.ops.derived_variables.calculate_toa_radiation(lat, lon, time) @pytest.mark.parametrize("time", TIME) @@ -91,14 +91,14 @@ def test_hour_of_day(time): Test the `calculate_hour_of_day` function from mllam_data_prep.derived_variables """ with patch( - "mllam_data_prep.derived_variables.cyclic_encoding", + "mllam_data_prep.ops.derived_variables.cyclic_encoding", side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.derived_variables.calculate_hour_of_day(time) + mdp.ops.derived_variables.calculate_hour_of_day(time) else: with 
pytest.raises(TypeError): - mdp.derived_variables.calculate_hour_of_day(time) + mdp.ops.derived_variables.calculate_hour_of_day(time) @pytest.mark.parametrize("time", TIME) @@ -107,11 +107,11 @@ def test_day_of_year(time): Test the `calculate_day_of_year` function from mllam_data_prep.derived_variables """ with patch( - "mllam_data_prep.derived_variables.cyclic_encoding", + "mllam_data_prep.ops.derived_variables.cyclic_encoding", side_effect=mock_cyclic_encoding, ): if isinstance(time, (xr.DataArray, datetime.datetime)): - mdp.derived_variables.calculate_day_of_year(time) + mdp.ops.derived_variables.calculate_day_of_year(time) else: with pytest.raises(TypeError): - mdp.derived_variables.calculate_day_of_year(time) + mdp.ops.derived_variables.calculate_day_of_year(time) From 2c0bdf879df0fbfe428e1eb1318267fe5a4d4713 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 18 Dec 2024 09:00:08 +0000 Subject: [PATCH 60/68] Add global REQUIRED_FIELD_ATTRIBUTES var and updated check for required attributes --- mllam_data_prep/ops/derived_variables.py | 53 ++++++++++++++---------- 1 file changed, 30 insertions(+), 23 deletions(-) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 9ad495e..41db614 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -16,6 +16,8 @@ from .chunking import check_chunk_size +REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] + def derive_variables(ds, derived_variables, chunking): """ @@ -44,7 +46,7 @@ def derive_variables(ds, derived_variables, chunking): for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function - derived_variable_attributes = derived_variable.attrs + expected_field_attributes = derived_variable.attrs # Separate the lat,lon from the required variables as these will be derived separately logger.warning( @@ -87,17 +89,18 @@ def derive_variables(ds, 
derived_variables, chunking): # Calculate the derived variable derived_field = func(**kwargs) - # Check the derived field(s) and add it to the dataset + # Check that the derived field has the necessary attributes (REQUIRED_FIELD_ATTRIBUTES) + # set and add it to the dataset if isinstance(derived_field, xr.DataArray): - derived_field = _check_attributes( - derived_field, derived_variable_attributes + derived_field = _check_for_required_attributes( + derived_field, expected_field_attributes ) ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: - field = _check_attributes(field, derived_variable_attributes) + field = _check_for_required_attributes(field, expected_field_attributes) ds_derived_vars[field.name] = field else: raise TypeError( @@ -190,7 +193,7 @@ def _get_derived_variable_function(function_namespace): return function -def _check_attributes(field, field_attributes): +def _check_for_required_attributes(field, expected_attributes): """ Check the attributes of the derived variable. @@ -198,8 +201,8 @@ def _check_attributes(field, field_attributes): ---------- field: xr.DataArray The derived field - field_attributes: Dict[str, str] - Dictionary with attributes for the derived variables. + expected_attributes: Dict[str, str] + Dictionary with expected attributes for the derived variables. Defined in the config file. 
Returns @@ -207,32 +210,36 @@ def _check_attributes(field, field_attributes): field: xr.DataArray The derived field """ - for attribute in ["units", "long_name"]: + for attribute in REQUIRED_FIELD_ATTRIBUTES: if attribute not in field.attrs or field.attrs[attribute] is None: - if attribute in field_attributes.keys(): - field.attrs[attribute] = field_attributes[attribute] + if attribute in expected_attributes.keys(): + field.attrs[attribute] = expected_attributes[attribute] else: # The expected attributes are empty and the attributes have not been # set during the calculation of the derived variable - raise ValueError( - f"The attribute '{attribute}' has not been set for the derived" - f" variable '{field.name}' (most likely because you are using a" - " function external to `mlllam-data-prep` to derive the field)." - " This attribute has not been defined in the 'attributes' section" - " of the config file either. Make sure that you add it to the" - f" 'attributes' section of the derived variable '{field.name}'." + raise KeyError( + f'The attribute "{attribute}" has not been set for the derived' + f' variable "{field.name}". This is most likely because you are' + " using a function external to `mlllam-data-prep` to derive the field," + f" in which the required attributes ({', '.join(REQUIRED_FIELD_ATTRIBUTES)})" + " are not set. If they are not set in the function call when deriving the field," + ' they can be set in the config file by adding an "attrs" section under the' + f' "{field.name}" derived variable section. For example, if the required attributes' + f" ({', '.join(REQUIRED_FIELD_ATTRIBUTES)}) are not set for a derived variable named" + f' "toa_radiation" they can be set by adding the following to the config file:' + ' {"attrs": {"units": "W*m**-2", "long_name": "top-of-atmosphere incoming radiation"}}.' 
) - elif attribute in field_attributes.keys(): + elif attribute in expected_attributes.keys(): logger.warning( f"The attribute '{attribute}' of the derived field" f" {field.name} is being overwritten from" f" '{field.attrs[attribute]}' to" - f" '{field_attributes[attribute]}' according" - " to specification in the config file." + f" '{expected_attributes[attribute]}' according" + " to the specification in the config file." ) - field.attrs[attribute] = field_attributes[attribute] + field.attrs[attribute] = expected_attributes[attribute] else: - # Attributes are set and nothing has been defined in the config file + # Attributes are set in the function and nothing has been defined in the config file pass return field From f1ce6d196e9662663593b5933afcf188cee0e911 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 18 Dec 2024 09:01:57 +0000 Subject: [PATCH 61/68] Update long name for toa_radiation --- mllam_data_prep/ops/derived_variables.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 41db614..8dafcd0 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -277,7 +277,7 @@ def _return_dropped_coordinates( def calculate_toa_radiation(lat, lon, time): """ - Function for calculating top-of-the-atmosphere radiation + Function for calculating top-of-atmosphere incoming radiation Parameters ---------- @@ -291,9 +291,9 @@ def calculate_toa_radiation(lat, lon, time): Returns ------- toa_radiation : Union[xr.DataArray, float] - TOA radiation data + Top-of-atmosphere incoming radiation """ - logger.info("Calculating top-of-atmosphere radiation") + logger.info("Calculating top-of-atmosphere incoming radiation") # Solar constant solar_constant = 1366 # W*m**-2 @@ -331,7 +331,7 @@ def calculate_toa_radiation(lat, lon, time): if isinstance(toa_radiation, xr.DataArray): # Add attributes toa_radiation.name =
"toa_radiation" - toa_radiation.attrs["long_name"] = "top-of-the-atmosphere radiation" + toa_radiation.attrs["long_name"] = "top-of-atmosphere incoming radiation" toa_radiation.attrs["units"] = "W*m**-2" return toa_radiation From 58d8af6cbcce45b0bce5416c72249f04c5a6b405 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Wed, 18 Dec 2024 09:03:11 +0000 Subject: [PATCH 62/68] Update README --- README.md | 37 +++++++++++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index de089dc..fcb903d 100644 --- a/README.md +++ b/README.md @@ -371,10 +371,39 @@ The `inputs` section defines the source datasets to extract data from. Each sour - `rename`: simply rename the dimension to the new name - `stack`: stack the listed dimension to create the dimension in the output - `stack_variables_by_var_name`: stack the dimension into the new dimension, and also stack the variable name into the new variable name. This is useful when you have multiple variables with the same dimensions that you want to stack into a single variable. -- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with the following additional information. - - `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.derived_variables` module it is enough with the function name only. - - `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the variable name to select from the source dataset and each value is the named argument to `function`. - - `attributes`: section where users can specify attributes (e.g. 
`units` and `long_name`) as a dictionary (not included in the example config file), where the keys are the attribute names and the values are strings. If using a function defined in `mllam_data_prep.derived_variables` this section is optional as the attributes should already be defined. In this case, adding the attributes to the config file will overwrite the already-defined ones. If using an external function, where the attributes `units` and `long_name` are not set, this section is a requirement. +- `derived_variables`: defines the variables to be derived from the variables available in the source dataset. This should be a dictionary where each key is the variable to be derived and the value defines a dictionary with the following additional information. See the 'Derived Variables' section for more details. + - `function`: the function to be used to derive a variable. This should be a string and may either be the full namespace of the function (e.g. `mllam_data_prep.ops.derived_variables.calculate_toa_radiation`) or in case the function is included in the `mllam_data_prep.ops.derived_variables` module it is enough with the function name only. + - `kwargs`: arguments for the function used to derive a variable. This is a dictionary where each key is the name of the variables to select from the source dataset and each value is the named argument to `function`. + +#### Derived Variables +Variables that are not part of the source dataset but can be derived from variables in the source dataset can also be included. They should be defined in their own section, called `derived_variables` as illustrated in the example config above and in the `example.danra.yaml` config file. + +To derive the variables, the function to be used to derive the variable (`function`) and the arguments to this function (`kwargs`) need to be specified, as explained above. In addition, an optional section called `attrs` can be added. 
In this section, the user can add attributes to the derived variable, as illustrated below. +```yaml + derived_variables: + toa_radiation: + kwargs: + time: time + lat: lat + lon: lon + function: mllam_data_prep.ops.derived_variables.calculate_toa_radiation + attrs: + units: W*m**-2 + long_name: top-of-atmosphere incoming radiation +``` + +Note that the attributes `units` and `long_name` are required. This means that if the function used to derive a variable does not set these attributes they are **required** to be set in the config file. If using a function defined in `mllam_data_prep.ops.derived_variables` the `attrs` section is optional as the attributes should already be defined. In this case, adding the `units` and `long_name` attributes to the `attrs` section of the derived variable in the config file will overwrite the already-defined attributes from the function. + +Currently, the following derived variables are included as part of `mllam-data-prep`: +- `toa_radiation`: + - Top-of-atmosphere incoming radiation + - function: `mllam_data_prep.ops.derived_variables.calculate_toa_radiation` +- `hour_of_day`: + - Hour of day (cyclically encoded) + - function: `mllam_data_prep.ops.derived_variables.calculate_hour_of_day` +- `day_of_year`: + - Day of year (cyclically encoded) + - function: `mllam_data_prep.ops.derived_variables.calculate_day_of_year` ### Config schema versioning From f87b95438f7452e0757dc5e65699e7190962d2b8 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 07:23:47 +0000 Subject: [PATCH 63/68] Return dropped coordinates to the data-arrays instead --- mllam_data_prep/ops/derived_variables.py | 27 ++++++++++++------------ 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 8dafcd0..2e30f12 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -95,12 +95,18 @@ def derive_variables(ds,
derived_variables, chunking): derived_field = _check_for_required_attributes( derived_field, expected_field_attributes ) + derived_field = _return_dropped_coordinates( + derived_field, ds_input, required_coordinates, chunks + ) ds_derived_vars[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: field = _check_for_required_attributes(field, expected_field_attributes) + field = _return_dropped_coordinates( + field, ds_input, required_coordinates, chunks + ) ds_derived_vars[field.name] = field else: raise TypeError( @@ -108,11 +114,6 @@ def derive_variables(ds, derived_variables, chunking): f" but got {type(derived_field)}." ) - # Add back dropped coordinates - ds_derived_vars = _return_dropped_coordinates( - ds_derived_vars, ds_input, required_coordinates, chunks - ) - return ds_derived_vars @@ -245,16 +246,14 @@ def _check_for_required_attributes(field, expected_attributes): return field -def _return_dropped_coordinates( - ds_derived_vars, ds_input, required_coordinates, chunks -): +def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): """ - Return the coordinates that have been reset. + Return the coordinates that have been dropped/reset. 
Parameters ---------- - ds_derived_vars: xr.Dataset - Dataset with derived variables + derived_field: xr.Dataset + Derived variable ds_input: xr.Dataset Input dataset for deriving variables required_coordinates: List[str] @@ -265,14 +264,14 @@ def _return_dropped_coordinates( Returns ------- - ds_derived_vars: xr.Dataset + derived_field: xr.Dataset Dataset with derived variables, now also with dropped coordinates returned """ for req_coord in required_coordinates: if req_coord in chunks: - ds_derived_vars.coords[req_coord] = ds_input[req_coord] + derived_field.coords[req_coord] = ds_input[req_coord] - return ds_derived_vars + return derived_field def calculate_toa_radiation(lat, lon, time): From 80cf058440b421747e8745c0b95b7feea405aff2 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 07:27:42 +0000 Subject: [PATCH 64/68] Adds dims to the dataset to make it work with derived variables that doesn't have all dimensions. This way we don't need to broadcast these variables explicitly to all dimensions. 
--- mllam_data_prep/ops/derived_variables.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mllam_data_prep/ops/derived_variables.py b/mllam_data_prep/ops/derived_variables.py index 2e30f12..9876e42 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -43,6 +43,10 @@ def derive_variables(ds, derived_variables, chunking): ds_derived_vars = xr.Dataset() ds_derived_vars.attrs.update(ds.attrs) + # Add dimensions to the new dataset + for dim in ds.dims: + ds_derived_vars = ds_derived_vars.assign_coords({dim: ds.coords[dim]}) + for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs function_name = derived_variable.function From da0c171245b12b6aae4df10af33b5d97317d4a71 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 09:12:43 +0000 Subject: [PATCH 65/68] Add ability to have 'variables' and 'derived_variables' in the same dataset - Output dataset is created in 'create_dataset' instead of in the 'subset_dataset' and 'derive_variables' functions. - Rename dataset variables to make it clearer what they are and also make them more consistent between 'subset_dataset' and 'derive_variables'. - Add function for aligning the derived variables to the correct output dimensions. - Move the 'derived_variables' from their own dataset in the example config file to the 'danra_surface' dataset, as it is now possible to combine them. 
--- example.danra.yaml | 16 +--- mllam_data_prep/create_dataset.py | 14 +++- mllam_data_prep/ops/derived_variables.py | 97 +++++++++++++++++------- mllam_data_prep/ops/subsetting.py | 19 ++--- 4 files changed, 89 insertions(+), 57 deletions(-) diff --git a/example.danra.yaml b/example.danra.yaml index d6a9468..30682ff 100644 --- a/example.danra.yaml +++ b/example.danra.yaml @@ -61,22 +61,8 @@ inputs: variables: # use surface incoming shortwave radiation as forcing - swavr0m - dim_mapping: - time: - method: rename - dim: time - grid_index: - method: stack - dims: [x, y] - forcing_feature: - method: stack_variables_by_var_name - name_format: "{var_name}" - target_output_variable: forcing - - danra_derived_forcings: - path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr - dims: [time, x, y] derived_variables: + # derive variables to be used as forcings toa_radiation: kwargs: time: time diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 19bf4df..698aed9 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -145,16 +145,26 @@ def create_dataset(config: Config): except Exception as ex: raise Exception(f"Error loading dataset {dataset_name} from {path}") from ex + # Initialize the output dataset and add dimensions + ds = xr.Dataset() + ds.attrs.update(ds_input.attrs) + for dim in ds_input.dims: + ds = ds.assign_coords({dim: ds_input.coords[dim]}) + if variables: logger.info(f"Subsetting dataset {dataset_name}") ds = subset_dataset( - ds=ds_input, variables=variables, chunking=chunking_config + ds_subset=ds, + ds_input=ds_input, + variables=variables, + chunking=chunking_config, ) if derived_variables: logger.info(f"Deriving variables from {dataset_name}") ds = derive_variables( - ds=ds_input, + ds=ds, + ds_input=ds_input, derived_variables=derived_variables, chunking=chunking_config, ) diff --git a/mllam_data_prep/ops/derived_variables.py 
b/mllam_data_prep/ops/derived_variables.py index 9876e42..7502deb 100644 --- a/mllam_data_prep/ops/derived_variables.py +++ b/mllam_data_prep/ops/derived_variables.py @@ -19,14 +19,16 @@ REQUIRED_FIELD_ATTRIBUTES = ["units", "long_name"] -def derive_variables(ds, derived_variables, chunking): +def derive_variables(ds, ds_input, derived_variables, chunking): """ Load the dataset, and derive the specified variables Parameters --------- ds : xr.Dataset - Source dataset + Output dataset + ds_input : xr.Dataset + Input/source dataset derived_variables : Dict[str, DerivedVariable] Dictionary with the variables to derive with keys as the variable names and values with entries for kwargs and function to use in @@ -41,11 +43,7 @@ def derive_variables(ds, derived_variables, chunking): Dataset with derived variables included """ - ds_derived_vars = xr.Dataset() - ds_derived_vars.attrs.update(ds.attrs) - # Add dimensions to the new dataset - for dim in ds.dims: - ds_derived_vars = ds_derived_vars.assign_coords({dim: ds.coords[dim]}) + target_dims = list(ds_input.sizes.keys()) for _, derived_variable in derived_variables.items(): required_kwargs = derived_variable.kwargs @@ -62,63 +60,69 @@ def derive_variables(ds, derived_variables, chunking): if key in ["lat", "lon"]: latlon_coords_to_include[key] = required_kwargs.pop(key) - # Get input dataset for calculating derived variables - ds_input = ds[required_kwargs.keys()] + # Get subset of input dataset for calculating derived variables + ds_subset = ds_input[required_kwargs.keys()] # Any coordinates needed for the derivation, for which chunking should be performed, # should be converted to variables since it is not possible for *indexed* coordinates # to be chunked dask arrays chunks = { - dim: chunking.get(dim, int(ds_input[dim].count())) for dim in ds_input.dims + dim: chunking.get(dim, int(ds_subset[dim].count())) + for dim in ds_subset.dims } required_coordinates = [ - req_var for req_var in required_kwargs.keys() if req_var 
in ds_input.coords + req_var for req_var in required_kwargs.keys() if req_var in ds_subset.coords ] - ds_input = ds_input.drop_indexes(required_coordinates, errors="ignore") + ds_subset = ds_subset.drop_indexes(required_coordinates, errors="ignore") for req_coord in required_coordinates: if req_coord in chunks: - ds_input = ds_input.reset_coords(req_coord) + ds_subset = ds_subset.reset_coords(req_coord) # Chunk the dataset - ds_input = _chunk_dataset(ds_input, chunks) + ds_subset = _chunk_dataset(ds_subset, chunks) # Add function arguments to kwargs kwargs = {} if len(latlon_coords_to_include): - latlon = get_latlon_coords_for_input(ds) + latlon = get_latlon_coords_for_input(ds_input) for key, val in latlon_coords_to_include.items(): kwargs[val] = latlon[key] - kwargs.update({val: ds_input[key] for key, val in required_kwargs.items()}) + kwargs.update({val: ds_subset[key] for key, val in required_kwargs.items()}) func = _get_derived_variable_function(function_name) # Calculate the derived variable derived_field = func(**kwargs) # Check that the derived field has the necessary attributes (REQUIRED_FIELD_ATTRIBUTES) - # set and add it to the dataset + # set, return any dropped/reset coordinates, align it to the output dataset dimensions + # (if necessary) and add it to the dataset if isinstance(derived_field, xr.DataArray): derived_field = _check_for_required_attributes( derived_field, expected_field_attributes ) derived_field = _return_dropped_coordinates( - derived_field, ds_input, required_coordinates, chunks + derived_field, ds_subset, required_coordinates, chunks ) - ds_derived_vars[derived_field.name] = derived_field + derived_field = _align_derived_variable( + derived_field, ds_input, target_dims + ) + ds[derived_field.name] = derived_field elif isinstance(derived_field, tuple) and all( isinstance(field, xr.DataArray) for field in derived_field ): for field in derived_field: field = _check_for_required_attributes(field, expected_field_attributes) field = 
_return_dropped_coordinates( - field, ds_input, required_coordinates, chunks + field, ds_subset, required_coordinates, chunks ) - ds_derived_vars[field.name] = field + field = _align_derived_variable(field, ds_input, target_dims) + ds[field.name] = field else: raise TypeError( "Expected an instance of xr.DataArray or tuple(xr.DataArray)," f" but got {type(derived_field)}." ) - return ds_derived_vars + return ds def _chunk_dataset(ds, chunks): @@ -250,7 +254,7 @@ def _check_for_required_attributes(field, expected_attributes): return field -def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, chunks): +def _return_dropped_coordinates(derived_field, ds, required_coordinates, chunks): """ Return the coordinates that have been dropped/reset. @@ -258,8 +262,8 @@ def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, c ---------- derived_field: xr.Dataset Derived variable - ds_input: xr.Dataset - Input dataset for deriving variables + ds: xr.Dataset + Dataset with required coordinates required_coordinates: List[str] List of coordinates required for the derived variable chunks: Dict[str, int] @@ -269,15 +273,50 @@ def _return_dropped_coordinates(derived_field, ds_input, required_coordinates, c Returns ------- derived_field: xr.Dataset - Dataset with derived variables, now also with dropped coordinates returned + Derived variable, now also with dropped coordinates returned """ for req_coord in required_coordinates: if req_coord in chunks: - derived_field.coords[req_coord] = ds_input[req_coord] + derived_field.coords[req_coord] = ds[req_coord] return derived_field +def _align_derived_variable(field, ds, target_dims): + """ + Align a derived variable to the target dimensions (ignoring non-dimension coordinates). + + Parameters + ---------- + field: xr.DataArray + Derived field to align + ds: xr.Dataset + Target dataset + target_dims: List[str] + Dimensions to align to (e.g.
'time', 'y', 'x') + + Returns + ------- + field: xr.DataArray + The derived field aligned to the target dimensions + """ + # Ensure that dimensions are ordered correctly + field = field.transpose( + *[dim for dim in target_dims if dim in field.dims], missing_dims="ignore" + ) + + # Add missing dimensions explicitly + for dim in target_dims: + if dim not in field.dims: + field = field.expand_dims({dim: ds.sizes[dim]}) + + # Broadcast to match only the target dimensions + broadcast_shape = {dim: ds[dim] for dim in target_dims if dim in ds.dims} + field = field.broadcast_like(xr.Dataset(coords=broadcast_shape)) + + return field + + def calculate_toa_radiation(lat, lon, time): """ Function for calculating top-of-atmosphere incoming radiation @@ -467,6 +506,6 @@ def cyclic_encoding(data, data_max): return data_cos, data_sin -def get_latlon_coords_for_input(ds_input): +def get_latlon_coords_for_input(ds): """Dummy function for getting lat and lon.""" - return ds_input[["lat", "lon"]].chunk(-1, -1) + return ds[["lat", "lon"]].chunk(-1, -1) diff --git a/mllam_data_prep/ops/subsetting.py b/mllam_data_prep/ops/subsetting.py index 8cfa8ca..d2ba3a8 100644 --- a/mllam_data_prep/ops/subsetting.py +++ b/mllam_data_prep/ops/subsetting.py @@ -1,15 +1,14 @@ -import xarray as xr - - -def subset_dataset(ds, variables, chunking): +def subset_dataset(ds_subset, ds_input, variables, chunking): """ Select specific variables from the provided the dataset, subset the variables along the specified coordinates and check coordinate units Parameters ---------- - ds : xr.Dataset - Source dataset + ds_subset : xr.Dataset + Subset of ds_input + ds_input : xr.Dataset + Input/source dataset variables : dict Dictionary with the variables to subset with keys as the variable names and values with entries for each @@ -19,11 +18,9 @@ def subset_dataset(ds, variables, chunking): with the chunk size """ - ds_subset = xr.Dataset() - ds_subset.attrs.update(ds.attrs) if isinstance(variables, dict): for var, 
coords_to_sample in variables.items(): - da = ds[var] + da = ds_input[var] for coord, sampling in coords_to_sample.items(): coord_values = sampling.values try: @@ -43,11 +40,11 @@ def subset_dataset(ds, variables, chunking): ds_subset[var] = da elif isinstance(variables, list): try: - ds_subset = ds[variables] + ds_subset = ds_input[variables] except KeyError as ex: raise KeyError( f"Could not find the all variables `{variables}` in the dataset. " - f"The available variables are {list(ds.data_vars)}" + f"The available variables are {list(ds_input.data_vars)}" ) from ex else: raise ValueError("The `variables` argument should be a list or a dictionary") From f61a3b6590cf858683e03dd5ef3cd846c4a67b2e Mon Sep 17 00:00:00 2001 From: ealerskans Date: Thu, 19 Dec 2024 09:32:37 +0000 Subject: [PATCH 66/68] Update README --- README.md | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/README.md b/README.md index fcb903d..48eb1b6 100644 --- a/README.md +++ b/README.md @@ -320,22 +320,8 @@ inputs: variables: # use surface incoming shortwave radiation as forcing - swavr0m - dim_mapping: - time: - method: rename - dim: time - grid_index: - method: stack - dims: [x, y] - forcing_feature: - method: stack_variables_by_var_name - name_format: "{var_name}" - target_output_variable: forcing - - danra_derived_forcings: - path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr - dims: [time, x, y] derived_variables: + # derive variables to be used as forcings toa_radiation: kwargs: time: time From 554f86940e1276f4301a25458e97f1c9537275e2 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 20 Dec 2024 08:10:03 +0000 Subject: [PATCH 67/68] Add 'load_config' function, which wraps 'from_yaml_file' and checks that either 'variables' or 'derived_variables' are included and that if both are included, they don't contain the same variable names --- README.md | 2 +- mllam_data_prep/config.py | 50 ++++++++++++++++++++++++++++++- 
mllam_data_prep/create_dataset.py | 2 +- 3 files changed, 51 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 48eb1b6..b95d186 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,7 @@ The package can also be used as a python module to create datasets directly, for import mllam_data_prep as mdp config_path = "example.danra.yaml" -config = mdp.Config.from_yaml_file(config_path) +config = mdp.Config.load_config(config_path) ds = mdp.create_dataset(config=config) ``` diff --git a/mllam_data_prep/config.py b/mllam_data_prep/config.py index f114f60..93f407b 100644 --- a/mllam_data_prep/config.py +++ b/mllam_data_prep/config.py @@ -328,6 +328,54 @@ class Config(dataclass_wizard.JSONWizard, dataclass_wizard.YAMLWizard): class _(JSONWizard.Meta): raise_on_unknown_json_key = True + @staticmethod + def load_config(*args, **kwargs): + """ + Wrapper function for `from_yaml_file` to load config file and validate that: + - either `variables` or `derived_variables` are present in the config + - if both `variables` and `derived_variables` are present, that they don't + add the same variables to the dataset + + Parameters + ---------- + *args: Positional arguments for `from_yaml_file` + **kwargs: Keyword arguments for `from_yaml_file` + + Returns + ------- + config: Config + """ + + # Load the config + config = Config.from_yaml_file(*args, **kwargs) + + for input_dataset in config.inputs.values(): + if not input_dataset.variables and not input_dataset.derived_variables: + raise InvalidConfigException( + "At least one of the keys `variables` and `derived_variables` must be included" + " in the input dataset." 
+ ) + elif input_dataset.variables and input_dataset.derived_variables: + # Check so that there are no overlapping variables + if isinstance(input_dataset.variables, list): + variable_vars = input_dataset.variables + elif isinstance(input_dataset.variables, dict): + variable_vars = input_dataset.variables.keys() + else: + raise TypeError( + f"Expected an instance of list or dict, but got {type(input_dataset.variables)}." + ) + derived_variable_vars = input_dataset.derived_variables.keys() + common_vars = list(set(variable_vars) & set(derived_variable_vars)) + if len(common_vars) > 0: + raise InvalidConfigException( + "Both `variables` and `derived_variables` include the following variables name(s):" + f" '{', '.join(common_vars)}'. This is not allowed. Make sure that there" + " are no overlapping variable names between `variables` and `derived_variables`," + f" either by renaming or removing '{', '.join(common_vars)}' from one of them." + ) + return config + if __name__ == "__main__": import argparse @@ -338,7 +386,7 @@ class _(JSONWizard.Meta): ) args = argparser.parse_args() - config = Config.from_yaml_file(args.f) + config = Config.load_config(args.f) import rich rich.print(config) diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py index 698aed9..93cf82d 100644 --- a/mllam_data_prep/create_dataset.py +++ b/mllam_data_prep/create_dataset.py @@ -286,7 +286,7 @@ def create_dataset_zarr(fp_config, fp_zarr: str = None): The path to the zarr file to write the dataset to. If not provided, the zarr file will be written to the same directory as the config file with the extension changed to '.zarr'. 
""" - config = Config.from_yaml_file(file=fp_config) + config = Config.load_config(file=fp_config) ds = create_dataset(config=config) From 085aae33259f507668db74f5352c59b08b281091 Mon Sep 17 00:00:00 2001 From: ealerskans Date: Fri, 20 Dec 2024 08:20:34 +0000 Subject: [PATCH 68/68] Update README --- README.md | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index b95d186..034aa60 100644 --- a/README.md +++ b/README.md @@ -175,32 +175,18 @@ inputs: variables: # use surface incoming shortwave radiation as forcing - swavr0m - dim_mapping: - time: - method: rename - dim: time - grid_index: - method: stack - dims: [x, y] - forcing_feature: - method: stack_variables_by_var_name - name_format: "{var_name}" - target_output_variable: forcing - - danra_derived_forcings: - path: https://mllam-test-data.s3.eu-north-1.amazonaws.com/single_levels.zarr - dims: [time, x, y] derived_variables: + # derive variables to be used as forcings toa_radiation: kwargs: time: time lat: lat lon: lon - function: mllam_data_prep.derived_variables.calculate_toa_radiation + function: mllam_data_prep.ops.derived_variables.calculate_toa_radiation hour_of_day: kwargs: time: time - function: mllam_data_prep.derived_variables.calculate_hour_of_day + function: mllam_data_prep.ops.derived_variables.calculate_hour_of_day dim_mapping: time: method: rename