Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Separate storing to zarr into a separate function #7

Merged
merged 7 commits into from
Jun 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased](https://github.com/mllam/mllam-data-prep/compare/v0.1.0...HEAD)

- split dataset creation and storage to zarr into separate functions `mllam_data_prep.create_dataset(...)` and `mllam_data_prep.create_dataset_zarr(...)` respectively [#7](https://github.com/mllam/mllam-data-prep/pull/7)

## [v0.1.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.1.0)

First tagged release of `mllam-data-prep` which includes functionality to
Expand Down
2 changes: 2 additions & 0 deletions mllam_data_prep/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# expose the public API
from .create_dataset import create_dataset, create_dataset_zarr # noqa
4 changes: 2 additions & 2 deletions mllam_data_prep/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from dask.diagnostics import ProgressBar

from .create_dataset import main
from .create_dataset import create_dataset_zarr

if __name__ == "__main__":
import argparse
Expand All @@ -17,4 +17,4 @@
if args.show_progress:
ProgressBar().register()

main(fp_config=args.config)
create_dataset_zarr(fp_config=args.config)
39 changes: 33 additions & 6 deletions mllam_data_prep/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,20 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
return ds


def main(fp_config):
config = ConfigDict.load(fp_config=fp_config)

def create_dataset(config: ConfigDict):
"""
Create a dataset from the input datasets specified in the config file.

Parameters
----------
config : ConfigDict
The configuration file.

Returns
-------
xr.Dataset
The dataset created from the input datasets with a variable for each target architecture variable.
"""
architecture_config = config["architecture"]
architecture_input_ranges = architecture_config.get("input_range", {})

Expand Down Expand Up @@ -150,16 +161,32 @@ def main(fp_config):
dataarrays_by_target[target_arch_var].append(da_target)

ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)

# need to drop the encoding so that we can write to zarr with new chunksizes
ds = ds.drop_encoding()

# default to making a single chunk for each dimension if chunksize is not specified
# in the config
config_chunking = architecture_config.get("chunking", {})
chunks = {d: config_chunking.get(d, int(ds[d].count())) for d in ds.dims}
chunking_config = config["architecture"].get("chunking", {})
chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
ds = ds.chunk(chunks)

print(ds)
return ds


def create_dataset_zarr(fp_config):
"""
Create a dataset from the input datasets specified in the config file and write it to a zarr file.
The path to the zarr file is the same as the config file, but with the extension changed to '.zarr'.

Parameters
----------
fp_config : Path
The path to the configuration file.
"""
config = ConfigDict.load(fp_config=fp_config)

ds = create_dataset(config=config)

fp_out = fp_config.parent / fp_config.name.replace(".yaml", ".zarr")
if fp_out.exists():
Expand Down
93 changes: 92 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ dev = [
"pytest>=8.0.2",
"ipdb>=0.13.13",
"rich>=13.7.1",
"pre-commit>=3.7.1",
]
14 changes: 7 additions & 7 deletions tests/test_from_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
import yaml

import mllam_data_prep.create_dataset as mdp
import mllam_data_prep as mdp
import tests.data as testdata
from mllam_data_prep.config import InvalidConfigException

Expand Down Expand Up @@ -79,7 +79,7 @@ def test_merging_static_and_surface_analysis():
with open(fp_config, "w") as f:
yaml.dump(config, f)

mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)


@pytest.mark.parametrize("source_data_contains_time_range", [True, False])
Expand Down Expand Up @@ -156,13 +156,13 @@ def test_time_selection(source_data_contains_time_range, time_stepsize):

# run the main function
if source_data_contains_time_range and time_stepsize == testdata.DT_ANALYSIS:
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)
else:
print(
f"Expecting ValueError for source_data_contains_time_range={source_data_contains_time_range} and time_stepsize={time_stepsize}"
)
with pytest.raises(ValueError):
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)


@pytest.mark.parametrize("use_common_feature_var_name", [True, False])
Expand Down Expand Up @@ -241,11 +241,11 @@ def test_feature_collision(use_common_feature_var_name):

if use_common_feature_var_name:
with pytest.raises(InvalidConfigException):
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)
else:
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)


def test_danra_example():
fp_config = Path(__file__).parent.parent / "example.danra.yaml"
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)
Loading