From 1799817d35efd2444673316f0a7724e9a4ba2752 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 4 Jun 2024 16:27:41 +0100
Subject: [PATCH 1/7] separate storing to zarr into separate function

---
 mllam_data_prep/create_dataset.py | 29 ++++++++++++++++++-----------
 1 file changed, 18 insertions(+), 11 deletions(-)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index 56c4bc3..cc3bc41 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -80,9 +80,7 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
     return ds
 
 
-def main(fp_config):
-    config = ConfigDict.load(fp_config=fp_config)
-
+def create_dataset(config: ConfigDict):
     architecture_config = config["architecture"]
     architecture_input_ranges = architecture_config.get("input_range", {})
 
@@ -150,14 +148,23 @@ def main(fp_config):
             dataarrays_by_target[target_arch_var].append(da_target)
 
     ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)
-    # need to drop the encoding so that we can write to zarr with new chunksizes
-    ds = ds.drop_encoding()
-
-    # default to making a single chunk for each dimension if chunksize is not specified
-    # in the config
-    config_chunking = architecture_config.get("chunking", {})
-    chunks = {d: config_chunking.get(d, int(ds[d].count())) for d in ds.dims}
-    ds = ds.chunk(chunks)
+
+
+def main(fp_config):
+    config = ConfigDict.load(fp_config=fp_config)
+
+    ds = create_dataset(config=config)
+
+    chunking_config = config["architecture"].get("chunking", {})
+
+    if chunking_config != {}:
+        # need to drop the encoding so that we can write to zarr with new chunksizes
+        ds = ds.drop_encoding()
+
+        # default to making a single chunk for each dimension if chunksize is not specified
+        # in the config
+        chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
+        ds = ds.chunk(chunks)
 
     print(ds)

From 34683b46897d766297abdd30eae6a253d7fcbeda Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 4 Jun 2024 16:29:30 +0100
Subject: [PATCH 2/7] bugfix

---
 mllam_data_prep/create_dataset.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index cc3bc41..b4fcc3a 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -149,6 +149,8 @@ def create_dataset(config: ConfigDict):
 
     ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)
 
+    return ds
+
 
 def main(fp_config):
     config = ConfigDict.load(fp_config=fp_config)

From 4e8b4b004d85b3f1a205af0311ef7d00d6395724 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 4 Jun 2024 16:31:01 +0100
Subject: [PATCH 3/7] another fix

---
 mllam_data_prep/create_dataset.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index b4fcc3a..5f59285 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -149,6 +149,9 @@ def create_dataset(config: ConfigDict):
 
     ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)
 
+    # need to drop the encoding so that we can write to zarr with new chunksizes
+    ds = ds.drop_encoding()
+
     return ds
 
 
@@ -160,9 +163,6 @@ def main(fp_config):
     chunking_config = config["architecture"].get("chunking", {})
 
     if chunking_config != {}:
-        # need to drop the encoding so that we can write to zarr with new chunksizes
-        ds = ds.drop_encoding()
-
         # default to making a single chunk for each dimension if chunksize is not specified
        # in the config
         chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
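Note on the chunking fallback these first patches shuffle between `create_dataset` and `main`: any dimension without an explicit chunk size in the config collapses to a single chunk spanning the whole dimension. A minimal sketch of that expression on a toy dataset — the variable, dimension, and chunk-size names here are invented for illustration, not taken from the patches:

    import numpy as np
    import xarray as xr

    # stand-in for the merged dataset produced by create_dataset()
    ds = xr.Dataset(
        {"state": (("time", "grid_index"), np.zeros((10, 100)))},
        coords={"time": np.arange(10), "grid_index": np.arange(100)},
    )

    # chunk sizes requested in the config; "grid_index" is deliberately
    # omitted, so the comprehension below gives it one full-size chunk
    chunking_config = {"time": 5}
    chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
    print(chunks)  # {'time': 5, 'grid_index': 100}

    ds = ds.chunk(chunks)  # needs dask installed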
From 5a2e10c857a928ab22dd3d1cb9535f50b0d0baf3 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Wed, 5 Jun 2024 11:16:20 +0100
Subject: [PATCH 4/7] do chunking too

---
 mllam_data_prep/create_dataset.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index 5f59285..e6fd05b 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -152,6 +152,12 @@ def create_dataset(config: ConfigDict):
     # need to drop the encoding so that we can write to zarr with new chunksizes
     ds = ds.drop_encoding()
 
+    # default to making a single chunk for each dimension if chunksize is not specified
+    # in the config
+    chunking_config = config["architecture"].get("chunking", {})
+    chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
+    ds = ds.chunk(chunks)
+
     return ds
 
 
@@ -160,16 +166,6 @@ def main(fp_config):
 
     ds = create_dataset(config=config)
 
-    chunking_config = config["architecture"].get("chunking", {})
-
-    if chunking_config != {}:
-        # default to making a single chunk for each dimension if chunksize is not specified
-        # in the config
-        chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
-        ds = ds.chunk(chunks)
-
-    print(ds)
-
     fp_out = fp_config.parent / fp_config.name.replace(".yaml", ".zarr")
     if fp_out.exists():
         logger.info(f"Removing existing dataset at {fp_out}")
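After this patch `create_dataset` always drops encoding and re-chunks before returning, and `main` is reduced to loading the config and writing zarr. The `drop_encoding()` call matters because a dataset opened from (or previously written to) zarr remembers its on-disk chunk sizes in `.encoding`, and xarray refuses to write conflicting chunks. A sketch of that failure mode, under stated assumptions — the store paths are placeholders, zarr and dask are assumed installed, and the exact error message wording varies by xarray version:

    import numpy as np
    import xarray as xr

    ds = xr.Dataset({"x": ("t", np.arange(100.0))}, coords={"t": np.arange(100)})
    ds.chunk({"t": 10}).to_zarr("original.zarr", mode="w")

    # re-opening leaves encoding["chunks"] = (10,) on the variables
    ds = xr.open_zarr("original.zarr")

    try:
        # dask chunks of 25 conflict with the remembered zarr chunks of 10
        ds.chunk({"t": 25}).to_zarr("rechunked.zarr", mode="w")
    except ValueError as err:
        print(err)

    # dropping the stale encoding first, as create_dataset() now does,
    # lets the new chunk sizes win
    ds.drop_encoding().chunk({"t": 25}).to_zarr("rechunked.zarr", mode="w")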
From a02405a182521e768d20475722cbbdf3d28fd10b Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 25 Jun 2024 08:53:20 +0200
Subject: [PATCH 5/7] rename zarr function create_dataset_zarr

---
 CHANGELOG.md                      |  4 ++
 mllam_data_prep/__init__.py       |  1 +
 mllam_data_prep/__main__.py       |  4 +-
 mllam_data_prep/create_dataset.py | 24 +++++++-
 pdm.lock                          | 93 ++++++++++++++++++++++++++++++-
 pyproject.toml                    |  1 +
 tests/test_from_config.py         | 12 ++--
 7 files changed, 129 insertions(+), 10 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8144f3d..179828d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [unreleased](https://github.com/mllam/mllam-data-prep/compare/v0.1.0...HEAD)
+
+- split dataset creation and storage to zarr into separate functions `mllam_data_prep.create_dataset(...)` and `mllam_data_prep.create_dataset_zarr(...)` respectively ![\#7](https://github.com/mllam/mllam-data-prep/pull/7)
+
 ## [v0.1.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.1.0)
 
 First tagged release of `mllam-data-prep` which includes functionality to
diff --git a/mllam_data_prep/__init__.py b/mllam_data_prep/__init__.py
index e69de29..28399ec 100644
--- a/mllam_data_prep/__init__.py
+++ b/mllam_data_prep/__init__.py
@@ -0,0 +1 @@
+from .create_dataset import create_dataset  # noqa
diff --git a/mllam_data_prep/__main__.py b/mllam_data_prep/__main__.py
index ed7d7e7..4faca7f 100644
--- a/mllam_data_prep/__main__.py
+++ b/mllam_data_prep/__main__.py
@@ -2,7 +2,7 @@
 
 from dask.diagnostics import ProgressBar
 
-from .create_dataset import main
+from .create_dataset import create_dataset_zarr
 
 if __name__ == "__main__":
     import argparse
@@ -17,4 +17,4 @@
     if args.show_progress:
         ProgressBar().register()
 
-    main(fp_config=args.config)
+    create_dataset_zarr(fp_config=args.config)
diff --git a/mllam_data_prep/create_dataset.py b/mllam_data_prep/create_dataset.py
index e6fd05b..256f8cd 100644
--- a/mllam_data_prep/create_dataset.py
+++ b/mllam_data_prep/create_dataset.py
@@ -81,6 +81,19 @@
 
 
 def create_dataset(config: ConfigDict):
+    """
+    Create a dataset from the input datasets specified in the config file.
+
+    Parameters
+    ----------
+    config : ConfigDict
+        The configuration file.
+
+    Returns
+    -------
+    xr.Dataset
+        The dataset created from the input datasets with a variable for each target architecture variable.
+    """
     architecture_config = config["architecture"]
     architecture_input_ranges = architecture_config.get("input_range", {})
 
@@ -161,7 +174,16 @@ def create_dataset(config: ConfigDict):
     return ds
 
 
-def main(fp_config):
+def create_dataset_zarr(fp_config):
+    """
+    Create a dataset from the input datasets specified in the config file and write it to a zarr file.
+    The path to the zarr file is the same as the config file, but with the extension changed to '.zarr'.
+
+    Parameters
+    ----------
+    fp_config : Path
+        The path to the configuration file.
+    """
     config = ConfigDict.load(fp_config=fp_config)
 
     ds = create_dataset(config=config)
diff --git a/pdm.lock b/pdm.lock
index 2e1e30c..03c161b 100644
--- a/pdm.lock
+++ b/pdm.lock
@@ -5,7 +5,7 @@
 groups = ["default", "dev"]
 strategy = ["cross_platform"]
 lock_version = "4.4.1"
-content_hash = "sha256:6f1e3aac9edfa92b330cc0d3c226074fab5100b4443f1c6f243a8b162588138e"
+content_hash = "sha256:f383ea64249a42ea354a36779f3da8116cff518581d1d896386c374a3d443ab3"
 
 [[package]]
 name = "aiohttp"
@@ -132,6 +132,16 @@ files = [
     {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"},
 ]
 
+[[package]]
+name = "cfgv"
+version = "3.4.0"
+requires_python = ">=3.8"
+summary = "Validate configuration and produce human readable error messages."
+files = [
+    {file = "cfgv-3.4.0-py2.py3-none-any.whl", hash = "sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9"},
+    {file = "cfgv-3.4.0.tar.gz", hash = "sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560"},
+]
+
 [[package]]
 name = "charset-normalizer"
 version = "3.3.2"
@@ -250,6 +260,15 @@ files = [
     {file = "decorator-5.1.1.tar.gz", hash = "sha256:637996211036b6385ef91435e4fae22989472f9d571faba8927ba8253acbc330"},
 ]
 
+[[package]]
+name = "distlib"
+version = "0.3.8"
+summary = "Distribution utilities"
+files = [
+    {file = "distlib-0.3.8-py2.py3-none-any.whl", hash = "sha256:034db59a0b96f8ca18035f36290806a9a6e6bd9d1ff91e45a7f172eb17e51784"},
+    {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"},
+]
+
 [[package]]
 name = "exceptiongroup"
 version = "1.2.0"
@@ -280,6 +299,16 @@ files = [
     {file = "fasteners-0.19.tar.gz", hash = "sha256:b4f37c3ac52d8a445af3a66bce57b33b5e90b97c696b7b984f530cf8f0ded09c"},
 ]
 
+[[package]]
+name = "filelock"
+version = "3.15.4"
+requires_python = ">=3.8"
+summary = "A platform independent file lock."
+files = [
+    {file = "filelock-3.15.4-py3-none-any.whl", hash = "sha256:6ca1fffae96225dab4c6eaf1c4f4f28cd2568d3ec2a44e15a08520504de468e7"},
+    {file = "filelock-3.15.4.tar.gz", hash = "sha256:2207938cbc1844345cb01a5a95524dae30f0ce089eba5b00378295a17e3e90cb"},
+]
+
 [[package]]
 name = "frozenlist"
 version = "1.4.1"
@@ -345,6 +374,16 @@ files = [
     {file = "fsspec-2024.2.0.tar.gz", hash = "sha256:b6ad1a679f760dda52b1168c859d01b7b80648ea6f7f7c7f5a8a91dc3f3ecb84"},
 ]
 
+[[package]]
+name = "identify"
+version = "2.5.36"
+requires_python = ">=3.8"
+summary = "File identification library for Python"
+files = [
+    {file = "identify-2.5.36-py2.py3-none-any.whl", hash = "sha256:37d93f380f4de590500d9dba7db359d0d3da95ffe7f9de1753faa159e71e7dfa"},
+    {file = "identify-2.5.36.tar.gz", hash = "sha256:e5e00f54165f9047fbebeb4a560f9acfb8af4c88232be60a488e9b68d122745d"},
+]
+
 [[package]]
 name = "idna"
 version = "3.6"
@@ -557,6 +596,16 @@ files = [
     {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"},
 ]
 
+[[package]]
+name = "nodeenv"
+version = "1.9.1"
+requires_python = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7"
+summary = "Node.js virtual environment builder"
+files = [
+    {file = "nodeenv-1.9.1-py2.py3-none-any.whl", hash = "sha256:ba11c9782d29c27c70ffbdda2d7415098754709be8a7056d79a737cd901155c9"},
+    {file = "nodeenv-1.9.1.tar.gz", hash = "sha256:6ec12890a2dab7946721edbfbcd91f3319c6ccc9aec47be7c7e6b7011ee6645f"},
+]
+
 [[package]]
 name = "numcodecs"
 version = "0.12.1"
@@ -701,6 +750,16 @@ files = [
     {file = "pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f"},
 ]
 
+[[package]]
+name = "platformdirs"
+version = "4.2.2"
+requires_python = ">=3.8"
+summary = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`."
+files = [
+    {file = "platformdirs-4.2.2-py3-none-any.whl", hash = "sha256:2d7a1657e36a80ea911db832a8a6ece5ee53d8de21edd5cc5879af6530b1bfee"},
+    {file = "platformdirs-4.2.2.tar.gz", hash = "sha256:38b7b51f512eed9e84a22788b4bce1de17c0adb134d6becb09836e37d8654cd3"},
+]
+
 [[package]]
 name = "pluggy"
 version = "1.4.0"
@@ -711,6 +770,23 @@ files = [
     {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"},
 ]
 
+[[package]]
+name = "pre-commit"
+version = "3.7.1"
+requires_python = ">=3.9"
+summary = "A framework for managing and maintaining multi-language pre-commit hooks."
+dependencies = [
+    "cfgv>=2.0.0",
+    "identify>=1.0.0",
+    "nodeenv>=0.11.1",
+    "pyyaml>=5.1",
+    "virtualenv>=20.10.0",
+]
+files = [
+    {file = "pre_commit-3.7.1-py2.py3-none-any.whl", hash = "sha256:fae36fd1d7ad7d6a5a1c0b0d5adb2ed1a3bda5a21bf6c3e5372073d7a11cd4c5"},
+    {file = "pre_commit-3.7.1.tar.gz", hash = "sha256:8ca3ad567bc78a4972a3f1a477e94a79d4597e8140a6e0b651c5e33899c3654a"},
+]
+
 [[package]]
 name = "prompt-toolkit"
 version = "3.0.43"
@@ -928,6 +1004,21 @@ files = [
     {file = "urllib3-2.2.1.tar.gz", hash = "sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19"},
 ]
 
+[[package]]
+name = "virtualenv"
+version = "20.26.3"
+requires_python = ">=3.7"
+summary = "Virtual Python Environment builder"
+dependencies = [
+    "distlib<1,>=0.3.7",
+    "filelock<4,>=3.12.2",
+    "platformdirs<5,>=3.9.1",
+]
+files = [
+    {file = "virtualenv-20.26.3-py3-none-any.whl", hash = "sha256:8cc4a31139e796e9a7de2cd5cf2489de1217193116a8fd42328f1bd65f434589"},
+    {file = "virtualenv-20.26.3.tar.gz", hash = "sha256:4c43a2a236279d9ea36a0d76f98d84bd6ca94ac4e0f4a3b9d46d05e10fea542a"},
+]
+
 [[package]]
 name = "wcwidth"
 version = "0.2.13"
diff --git a/pyproject.toml b/pyproject.toml
index 2863145..5402637 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,4 +34,5 @@ dev = [
     "pytest>=8.0.2",
     "ipdb>=0.13.13",
     "rich>=13.7.1",
+    "pre-commit>=3.7.1",
 ]
diff --git a/tests/test_from_config.py b/tests/test_from_config.py
index 7fd0cac..7e5d0ef 100644
--- a/tests/test_from_config.py
+++ b/tests/test_from_config.py
@@ -79,7 +79,7 @@ def test_merging_static_and_surface_analysis():
     with open(fp_config, "w") as f:
         yaml.dump(config, f)
 
-    mdp.main(fp_config=fp_config)
+    mdp.create_dataset_zarr(fp_config=fp_config)
 
 
 @pytest.mark.parametrize("source_data_contains_time_range", [True, False])
@@ -156,13 +156,13 @@ def test_time_selection(source_data_contains_time_range, time_stepsize):
 
     # run the main function
     if source_data_contains_time_range and time_stepsize == testdata.DT_ANALYSIS:
-        mdp.main(fp_config=fp_config)
+        mdp.create_dataset_zarr(fp_config=fp_config)
     else:
         print(
             f"Expecting ValueError for source_data_contains_time_range={source_data_contains_time_range} and time_stepsize={time_stepsize}"
         )
         with pytest.raises(ValueError):
-            mdp.main(fp_config=fp_config)
+            mdp.create_dataset_zarr(fp_config=fp_config)
 
 
 @pytest.mark.parametrize("use_common_feature_var_name", [True, False])
@@ -241,11 +241,11 @@ def test_feature_collision(use_common_feature_var_name):
 
     if use_common_feature_var_name:
         with pytest.raises(InvalidConfigException):
-            mdp.main(fp_config=fp_config)
+            mdp.create_dataset_zarr(fp_config=fp_config)
     else:
-        mdp.main(fp_config=fp_config)
+        mdp.create_dataset_zarr(fp_config=fp_config)
 
 
 def test_danra_example():
     fp_config = Path(__file__).parent.parent / "example.danra.yaml"
-    mdp.main(fp_config=fp_config)
+    mdp.create_dataset_zarr(fp_config=fp_config)
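The docstring added to `create_dataset_zarr` above pins down the output-path convention: the zarr store lands next to the config file, with only the extension swapped. A quick illustration of that rule, matching the `fp_out` expression visible in patch 4 (the file name is borrowed from the test suite's `example.danra.yaml`):

    from pathlib import Path

    fp_config = Path("example.danra.yaml")
    fp_out = fp_config.parent / fp_config.name.replace(".yaml", ".zarr")
    print(fp_out)  # example.danra.zarr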
From 76f0c4a5033208c1612d17e8dc2c5ec9e151ccf5 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 25 Jun 2024 08:56:31 +0200
Subject: [PATCH 6/7] fix import

---
 tests/test_from_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_from_config.py b/tests/test_from_config.py
index 7e5d0ef..65485a8 100644
--- a/tests/test_from_config.py
+++ b/tests/test_from_config.py
@@ -5,7 +5,7 @@
 import pytest
 import yaml
 
-import mllam_data_prep.create_dataset as mdp
+import mllam_data_prep as mdp
 import tests.data as testdata
 from mllam_data_prep.config import InvalidConfigException
 

From 5e4b3f407ad9334228c5ec661d7a4f70f5cd2ba6 Mon Sep 17 00:00:00 2001
From: Leif Denby
Date: Tue, 25 Jun 2024 08:59:33 +0200
Subject: [PATCH 7/7] fix imports 2

---
 mllam_data_prep/__init__.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mllam_data_prep/__init__.py b/mllam_data_prep/__init__.py
index 28399ec..44dd160 100644
--- a/mllam_data_prep/__init__.py
+++ b/mllam_data_prep/__init__.py
@@ -1 +1,2 @@
-from .create_dataset import create_dataset  # noqa
+# expose the public API
+from .create_dataset import create_dataset, create_dataset_zarr  # noqa
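With these two import fixes in place, the package-level API the tests rely on works end to end: `create_dataset` takes an in-memory `ConfigDict` and returns an `xr.Dataset`, while `create_dataset_zarr` takes a config path and writes the result to disk. A sketch of the resulting usage — the config path is a placeholder, and the `ConfigDict` import location is inferred from the test imports rather than shown in these patches:

    from pathlib import Path

    import mllam_data_prep as mdp
    from mllam_data_prep.config import ConfigDict  # assumed location

    fp_config = Path("example.danra.yaml")

    # in-memory variant: returns an xr.Dataset, writes nothing to disk
    config = ConfigDict.load(fp_config=fp_config)
    ds = mdp.create_dataset(config=config)

    # on-disk variant: loads the config itself and writes example.danra.zarr
    mdp.create_dataset_zarr(fp_config=fp_config)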