From 1799817d35efd2444673316f0a7724e9a4ba2752 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 4 Jun 2024 16:27:41 +0100
Subject: [PATCH 1/7] separate storing to zarr into separate function

---
 mllam_data_prep/create_dataset.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index 56c4bc3..cc3bc41 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -80,9 +80,7 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
     return ds
 
 
-def main(fp_config):
-    config = ConfigDict.load(fp_config=fp_config)
-
+def create_dataset(config: ConfigDict):
     architecture_config = config["architecture"]
     architecture_input_ranges = architecture_config.get("input_range", {})
 
@@ -150,14 +148,23 @@ def main(fp_config):
             dataarrays_by_target[target_arch_var].append(da_target)
 
     ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)
-    # need to drop the encoding so that we can write to zarr with new chunksizes
-    ds = ds.drop_encoding()
-
-    # default to making a single chunk for each dimension if chunksize is not specified
-    # in the config
-    config_chunking = architecture_config.get("chunking", {})
-    chunks = {d: config_chunking.get(d, int(ds[d].count())) for d in ds.dims}
-    ds = ds.chunk(chunks)
+
+
+def main(fp_config):
+    config = ConfigDict.load(fp_config=fp_config)
+
+    ds = create_dataset(config=config)
+
+    chunking_config = config["architecture"].get("chunking", {})
+
+    if chunking_config != {}:
+        # need to drop the encoding so that we can write to zarr with new chunksizes
+        ds = ds.drop_encoding()
+
+        # default to making a single chunk for each dimension if chunksize is not specified
+        # in the config
+        chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
+        ds = ds.chunk(chunks)
 
     print(ds)

From 34683b46897d766297abdd30eae6a253d7fcbeda Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 4 Jun 2024 16:29:30 +0100
Subject: [PATCH 2/7] bugfix

---
 mllam_data_prep/create_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index cc3bc41..b4fcc3a 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -149,6 +149,8 @@ def create_dataset(config: ConfigDict):
 
     ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)
 
+    return ds
+
 
 def main(fp_config):
     config = ConfigDict.load(fp_config=fp_config)

From 4e8b4b004d85b3f1a205af0311ef7d00d6395724 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 4 Jun 2024 16:31:01 +0100
Subject: [PATCH 3/7] another fix

---
 mllam_data_prep/create_dataset.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index b4fcc3a..5f59285 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -149,6 +149,9 @@ def create_dataset(config: ConfigDict):
 
     ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)
 
+    # need to drop the encoding so that we can write to zarr with new chunksizes
+    ds = ds.drop_encoding()
+
     return ds
 
 
@@ -160,9 +163,6 @@ def main(fp_config):
     chunking_config = config["architecture"].get("chunking", {})
 
     if chunking_config != {}:
-        # need to drop the encoding so that we can write to zarr with new chunksizes
-        ds = ds.drop_encoding()
-
         # default to making a single chunk for each dimension if chunksize is not specified
        # in the config
         chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
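Note on the chunking fallback these first patches shuffle between `create_dataset` and `main`: any dimension without an explicit chunk size in the config collapses to a single chunk spanning the whole dimension. A minimal sketch of that expression on a toy dataset — the variable, dimension, and chunk-size names here are invented for illustration, not taken from the patches:

    import numpy as np
    import xarray as xr

    # stand-in for the merged dataset produced by create_dataset()
    ds = xr.Dataset(
        {"state": (("time", "grid_index"), np.zeros((10, 100)))},
        coords={"time": np.arange(10), "grid_index": np.arange(100)},
    )

    # chunk sizes requested in the config; "grid_index" is deliberately
    # omitted, so the comprehension below gives it one full-size chunk
    chunking_config = {"time": 5}
    chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
    print(chunks)  # {'time': 5, 'grid_index': 100}

    ds = ds.chunk(chunks)  # needs dask installed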
From 5a2e10c857a928ab22dd3d1cb9535f50b0d0baf3 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Wed, 5 Jun 2024 11:16:20 +0100
Subject: [PATCH 4/7] do chunking too

---
 mllam_data_prep/create_dataset.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index 5f59285..e6fd05b 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -152,6 +152,12 @@ def create_dataset(config: ConfigDict):
     # need to drop the encoding so that we can write to zarr with new chunksizes
     ds = ds.drop_encoding()
 
+    # default to making a single chunk for each dimension if chunksize is not specified
+    # in the config
+    chunking_config = config["architecture"].get("chunking", {})
+    chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
+    ds = ds.chunk(chunks)
+
     return ds
 
 
@@ -160,16 +166,6 @@ def main(fp_config):
 
     ds = create_dataset(config=config)
 
-    chunking_config = config["architecture"].get("chunking", {})
-
-    if chunking_config != {}:
-        # default to making a single chunk for each dimension if chunksize is not specified
-        # in the config
-        chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
-        ds = ds.chunk(chunks)
-
-    print(ds)
-
     fp_out = fp_config.parent / fp_config.name.replace(".yaml", ".zarr")
     if fp_out.exists():
         logger.info(f"Removing existing dataset at {fp_out}")
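After this patch `create_dataset` always drops encoding and re-chunks before returning, and `main` is reduced to loading the config and writing zarr. The `drop_encoding()` call matters because a dataset opened from (or previously written to) zarr remembers its on-disk chunk sizes in `.encoding`, and xarray refuses to write conflicting chunks. A sketch of that failure mode, under stated assumptions — the store paths are placeholders, zarr and dask are assumed installed, and the exact error message wording varies by xarray version:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"x": ("t", np.arange(100.0))}, coords={"t": np.arange(100)})
    ds.chunk({"t": 10}).to_zarr("original.zarr", mode="w")

    # re-opening leaves encoding["chunks"] = (10,) on the variables
    ds = xr.open_zarr("original.zarr")

    try:
        # dask chunks of 25 conflict with the remembered zarr chunks of 10
        ds.chunk({"t": 25}).to_zarr("rechunked.zarr", mode="w")
    except ValueError as err:
        print(err)

    # dropping the stale encoding first, as create_dataset() now does,
    # lets the new chunk sizes win
    ds.drop_encoding().chunk({"t": 25}).to_zarr("rechunked.zarr", mode="w")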
From a02405a182521e768d20475722cbbdf3d28fd10b Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 25 Jun 2024 08:53:20 +0200
Subject: [PATCH 5/7] rename zarr function create_dataset_zarr

---
 CHANGELOG.md                      |  4 ++
 mllam_data_prep/__init__.py       |  1 +
 mllam_data_prep/__main__.py       |  4 +-
 mllam_data_prep/create_dataset.py | 24 +++++++-
 pdm.lock                          | 93 ++++++++++++++++++++++++++++++-
 pyproject.toml                    |  1 +
 tests/test_from_config.py         | 12 ++--
 7 files changed, 129 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8144f3d..179828d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [unreleased](https://github.com/mllam/mllam-data-prep/compare/v0.1.0...HEAD)
+
+- split dataset creation and storage to zarr into separate functions `mllam_data_prep.create_dataset(...)` and `mllam_data_prep.create_dataset_zarr(...)` respectively ![\#7](https://github.com/mllam/mllam-data-prep/pull/7)
+
 ## [v0.1.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.1.0)
 
 First tagged release of `mllam-data-prep` which includes functionality to
diff --git a/mllam_data_prep/__init__.py b/mllam_data_prep/__init__.py
index e69de29..28399ec 100644
--- a/mllam_data_prep/__init__.py
+++ b/mllam_data_prep/__init__.py
@@ -0,0 +1 @@
+from .create_dataset import create_dataset  # noqa
diff --git a/mllam_data_prep/__main__.py b/mllam_data_prep/__main__.py
index ed7d7e7..4faca7f 100644
--- a/mllam_data_prep/__main__.py
+++ b/mllam_data_prep/__main__.py
@@ -2,7 +2,7 @@
 
 from dask.diagnostics import ProgressBar
 
-from .create_dataset import main
+from .create_dataset import create_dataset_zarr
 
 if __name__ == "__main__":
     import argparse
@@ -17,4 +17,4 @@
     if args.show_progress:
         ProgressBar().register()
 
-    main(fp_config=args.config)
+    create_dataset_zarr(fp_config=args.config)
diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index e6fd05b..256f8cd 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -81,6 +81,19 @@
 
 
 def create_dataset(config: ConfigDict):
+    """
+    Create a dataset from the input datasets specified in the config file.
+
+    Parameters
+    ----------
+    config : ConfigDict
+        The configuration file.
+
+    Returns
+    -------
+    xr.Dataset
+        The dataset created from the input datasets with a variable for each target architecture variable.
+    """
     architecture_config = config["architecture"]
     architecture_input_ranges = architecture_config.get("input_range", {})
 
@@ -161,7 +174,16 @@ def create_dataset(config: ConfigDict):
     return ds
 
 
-def main(fp_config):
+def create_dataset_zarr(fp_config):
+    """
+    Create a dataset from the input datasets specified in the config file and write it to a zarr file.
+    The path to the zarr file is the same as the config file, but with the extension changed to '.zarr'.
+
+    Parameters
+    ----------
+    fp_config : Path
+        The path to the configuration file.
+    """
     config = ConfigDict.load(fp_config=fp_config)
 
     ds = create_dataset(config=config)
diff --git a/pdm.lock b/pdm.lock
index 2e1e30c..03c161b 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "dev"]
 strategy = ["cross_platform"]
 lock_version = "4.4.1"
-content_hash = "sha256:6f1e3aac9edfa92b330cc0d3c226074fab5100b4443f1c6f243a8b162588138e"
+content_hash = "sha256:f383ea64249a42ea354a36779f3da8116cff518581d1d896386c374a3d443ab3"
 
 [[package]]
 name = "aiohttp"
@@ -132,6 +132,16 @@ files = [
     {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"},
 ]
 
+[[package]]
+name = "cfgv"
+version = "3.4.0"
+requires_python = ">=3.8"
+summary = "Validate configuration and produce human readable error messages."
+files = [
+    {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
+    {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
+]
+
 [[package]]
 name = "charset-normalizer"
 version = "3.3.2"
@@ -250,6 +260,15 @@ files = [
     {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
 ]
 
+[[package]]
+name = "distlib"
+version = "0.3.8"
+summary = "Distribution utilities"
+files = [
+    {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
+    {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.0"
@@ -280,6 +299,16 @@ files = [
     {file = "fasteners-0.19.tar.gz", hash = "sha256:b4f37c3ac52d8a445af3a66bce57b33b5e90b97c696b7b984f530cf8f0ded09c"},
 ]
 
+[[package]]
+name = "filelock"
+version = "3.15.4"
+requires_python = ">=3.8"
+summary = "A platform independent file lock."
+files = [
+    {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"},
+    {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"},
+]
+
 [[package]]
 name = "frozenlist"
 version = "1.4.1"
@@ -345,6 +374,16 @@ files = [
     {file = "fsspec-2024.2.0.tar.gz", hash = "sha256:b6ad1a679f760dda52b1168c859d01b7b80648ea6f7f7c7f5a8a91dc3f3ecb84"},
 ]
 
+[[package]]
+name = "identify"
+version = "2.5.36"
+requires_python = ">=3.8"
+summary = "File identification library for Python"
+files = [
+    {file = "identify-2.5.36-py2.py3-none-any.whl", hash = "sha256:37d93f380f4de590500d9dba7db359d0d3da95ffe7f9de1753faa159e71e7dfa"},
+    {file = "identify-2.5.36.tar.gz", hash = "sha256:e5e00f54165f9047fbebeb4a560f9acfb8af4c88232be60a488e9b68d122745d"},
+]
+
 [[package]]
 name = "idna"
 version = "3.6"
@@ -557,6 +596,16 @@ files = [
     {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"},
 ]
 
+[[package]]
+name = "nodeenv"
+version = "1.9.1"
+requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+summary = "Node.js virtual environment builder"
+files = [
+    {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
+    {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
+]
+
 [[package]]
 name = "numcodecs"
 version = "0.12.1"
@@ -701,6 +750,16 @@ files = [
     {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
 ]
 
+[[package]]
+name = "platformdirs"
+version = "4.2.2"
+requires_python = ">=3.8"
+summary = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
+files = [
+    {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"},
+    {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"},
+]
+
 [[package]]
 name = "pluggy"
 version = "1.4.0"
@@ -711,6 +770,23 @@ files = [
     {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"},
 ]
 
+[[package]]
+name = "pre-commit"
+version = "3.7.1"
+requires_python = ">=3.9"
+summary = "A framework for managing and maintaining multi-language pre-commit hooks."
+dependencies = [
+    "cfgv>=2.0.0",
+    "identify>=1.0.0",
+    "nodeenv>=0.11.1",
+    "pyyaml>=5.1",
+    "virtualenv>=20.10.0",
+]
+files = [
+    {file = "pre_commit-3.7.1-py2.py3-none-any.whl", hash = "sha256:fae36fd1d7ad7d6a5a1c0b0d5adb2ed1a3bda5a21bf6c3e5372073d7a11cd4c5"},
+    {file = "pre_commit-3.7.1.tar.gz", hash = "sha256:8ca3ad567bc78a4972a3f1a477e94a79d4597e8140a6e0b651c5e33899c3654a"},
+]
+
 [[package]]
 name = "prompt-toolkit"
 version = "3.0.43"
@@ -928,6 +1004,21 @@ files = [
     {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
 ]
 
+[[package]]
+name = "virtualenv"
+version = "20.26.3"
+requires_python = ">=3.7"
+summary = "Virtual Python Environment builder"
+dependencies = [
+    "distlib<1,>=0.3.7",
+    "filelock<4,>=3.12.2",
+    "platformdirs<5,>=3.9.1",
+]
+files = [
+    {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"},
+    {file = "virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"},
+]
+
 [[package]]
 name = "wcwidth"
 version = "0.2.13"
diff --git a/pyproject.toml b/pyproject.toml
index 2863145..5402637 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,4 +34,5 @@ dev = [
     "pytest>=8.0.2",
     "ipdb>=0.13.13",
     "rich>=13.7.1",
+    "pre-commit>=3.7.1",
 ]
diff --git a/tests/test_from_config.py b/tests/test_from_config.py
index 7fd0cac..7e5d0ef 100644
--- a/tests/test_from_config.py
+++ b/tests/test_from_config.py
@@ -79,7 +79,7 @@ def test_merging_static_and_surface_analysis():
     with open(fp_config, "w") as f:
         yaml.dump(config, f)
 
-    mdp.main(fp_config=fp_config)
+    mdp.create_dataset_zarr(fp_config=fp_config)
 
 
 @pytest.mark.parametrize("source_data_contains_time_range", [True, False])
@@ -156,13 +156,13 @@ def test_time_selection(source_data_contains_time_range, time_stepsize):
 
     # run the main function
     if source_data_contains_time_range and time_stepsize == testdata.DT_ANALYSIS:
-        mdp.main(fp_config=fp_config)
+        mdp.create_dataset_zarr(fp_config=fp_config)
     else:
         print(
             f"Expecting ValueError for source_data_contains_time_range={source_data_contains_time_range} and time_stepsize={time_stepsize}"
         )
         with pytest.raises(ValueError):
-            mdp.main(fp_config=fp_config)
+            mdp.create_dataset_zarr(fp_config=fp_config)
 
 
 @pytest.mark.parametrize("use_common_feature_var_name", [True, False])
@@ -241,11 +241,11 @@ def test_feature_collision(use_common_feature_var_name):
 
     if use_common_feature_var_name:
         with pytest.raises(InvalidConfigException):
-            mdp.main(fp_config=fp_config)
+            mdp.create_dataset_zarr(fp_config=fp_config)
     else:
-        mdp.main(fp_config=fp_config)
+        mdp.create_dataset_zarr(fp_config=fp_config)
 
 
 def test_danra_example():
     fp_config = Path(__file__).parent.parent / "example.danra.yaml"
-    mdp.main(fp_config=fp_config)
+    mdp.create_dataset_zarr(fp_config=fp_config)
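The docstring added to `create_dataset_zarr` above pins down the output-path convention: the zarr store lands next to the config file, with only the extension swapped. A quick illustration of that rule, matching the `fp_out` expression visible in patch 4 (the file name is borrowed from the test suite's `example.danra.yaml`):

    from pathlib import Path

    fp_config = Path("example.danra.yaml")
    fp_out = fp_config.parent / fp_config.name.replace(".yaml", ".zarr")
    print(fp_out)  # example.danra.zarr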
From 76f0c4a5033208c1612d17e8dc2c5ec9e151ccf5 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 25 Jun 2024 08:56:31 +0200
Subject: [PATCH 6/7] fix import

---
 tests/test_from_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_from_config.py b/tests/test_from_config.py
index 7e5d0ef..65485a8 100644
--- a/tests/test_from_config.py
+++ b/tests/test_from_config.py
@@ -5,7 +5,7 @@
 import pytest
 import yaml
 
-import mllam_data_prep.create_dataset as mdp
+import mllam_data_prep as mdp
 import tests.data as testdata
 from mllam_data_prep.config import InvalidConfigException
 

From 5e4b3f407ad9334228c5ec661d7a4f70f5cd2ba6 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 25 Jun 2024 08:59:33 +0200
Subject: [PATCH 7/7] fix imports 2

---
 mllam_data_prep/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mllam_data_prep/__init__.py b/mllam_data_prep/__init__.py
index 28399ec..44dd160 100644
--- a/mllam_data_prep/__init__.py
+++ b/mllam_data_prep/__init__.py
@@ -1 +1,2 @@
-from .create_dataset import create_dataset  # noqa
+# expose the public API
+from .create_dataset import create_dataset, create_dataset_zarr  # noqa
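With these two import fixes in place, the package-level API the tests rely on works end to end: `create_dataset` takes an in-memory `ConfigDict` and returns an `xr.Dataset`, while `create_dataset_zarr` takes a config path and writes the result to disk. A sketch of the resulting usage — the config path is a placeholder, and the `ConfigDict` import location is inferred from the test imports rather than shown in these patches:

    from pathlib import Path

    import mllam_data_prep as mdp
    from mllam_data_prep.config import ConfigDict  # assumed location

    fp_config = Path("example.danra.yaml")

    # in-memory variant: returns an xr.Dataset, writes nothing to disk
    config = ConfigDict.load(fp_config=fp_config)
    ds = mdp.create_dataset(config=config)

    # on-disk variant: loads the config itself and writes example.danra.zarr
    mdp.create_dataset_zarr(fp_config=fp_config)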