Skip to content

Commit

Permalink
separate storing to zarr into separate function (#7)
Browse files Browse the repository at this point in the history
* separate storing to zarr into separate function

* bugfix

* another fix

* do chunking too

* rename zarr function create_dataset_zarr

* fix import

* fix imports 2
  • Loading branch information
leifdenby authored Jun 25, 2024
1 parent e1cf669 commit 0ceaa1c
Show file tree
Hide file tree
Showing 7 changed files with 141 additions and 16 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [unreleased](https://github.com/mllam/mllam-data-prep/compare/v0.1.0...HEAD)

- split dataset creation and storage to zarr into separate functions `mllam_data_prep.create_dataset(...)` and `mllam_data_prep.create_dataset_zarr(...)` respectively [#7](https://github.com/mllam/mllam-data-prep/pull/7)

## [v0.1.0](https://github.com/mllam/mllam-data-prep/releases/tag/v0.1.0)

First tagged release of `mllam-data-prep` which includes functionality to
Expand Down
2 changes: 2 additions & 0 deletions mllam_data_prep/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# expose the public API
from .create_dataset import create_dataset, create_dataset_zarr # noqa
4 changes: 2 additions & 2 deletions mllam_data_prep/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

from dask.diagnostics import ProgressBar

from .create_dataset import main
from .create_dataset import create_dataset_zarr

if __name__ == "__main__":
import argparse
Expand All @@ -17,4 +17,4 @@
if args.show_progress:
ProgressBar().register()

main(fp_config=args.config)
create_dataset_zarr(fp_config=args.config)
39 changes: 33 additions & 6 deletions mllam_data_prep/create_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,20 @@ def _merge_dataarrays_by_target(dataarrays_by_target):
return ds


def main(fp_config):
config = ConfigDict.load(fp_config=fp_config)

def create_dataset(config: ConfigDict):
"""
Create a dataset from the input datasets specified in the config file.
Parameters
----------
config : ConfigDict
The configuration file.
Returns
-------
xr.Dataset
The dataset created from the input datasets with a variable for each target architecture variable.
"""
architecture_config = config["architecture"]
architecture_input_ranges = architecture_config.get("input_range", {})

Expand Down Expand Up @@ -150,16 +161,32 @@ def main(fp_config):
dataarrays_by_target[target_arch_var].append(da_target)

ds = _merge_dataarrays_by_target(dataarrays_by_target=dataarrays_by_target)

# need to drop the encoding so that we can write to zarr with new chunksizes
ds = ds.drop_encoding()

# default to making a single chunk for each dimension if chunksize is not specified
# in the config
config_chunking = architecture_config.get("chunking", {})
chunks = {d: config_chunking.get(d, int(ds[d].count())) for d in ds.dims}
chunking_config = config["architecture"].get("chunking", {})
chunks = {d: chunking_config.get(d, int(ds[d].count())) for d in ds.dims}
ds = ds.chunk(chunks)

print(ds)
return ds


def create_dataset_zarr(fp_config):
"""
Create a dataset from the input datasets specified in the config file and write it to a zarr file.
The path to the zarr file is the same as the config file, but with the extension changed to '.zarr'.
Parameters
----------
fp_config : Path
The path to the configuration file.
"""
config = ConfigDict.load(fp_config=fp_config)

ds = create_dataset(config=config)

fp_out = fp_config.parent / fp_config.name.replace(".yaml", ".zarr")
if fp_out.exists():
Expand Down
93 changes: 92 additions & 1 deletion pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,5 @@ dev = [
"pytest>=8.0.2",
"ipdb>=0.13.13",
"rich>=13.7.1",
"pre-commit>=3.7.1",
]
14 changes: 7 additions & 7 deletions tests/test_from_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import pytest
import yaml

import mllam_data_prep.create_dataset as mdp
import mllam_data_prep as mdp
import tests.data as testdata
from mllam_data_prep.config import InvalidConfigException

Expand Down Expand Up @@ -79,7 +79,7 @@ def test_merging_static_and_surface_analysis():
with open(fp_config, "w") as f:
yaml.dump(config, f)

mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)


@pytest.mark.parametrize("source_data_contains_time_range", [True, False])
Expand Down Expand Up @@ -156,13 +156,13 @@ def test_time_selection(source_data_contains_time_range, time_stepsize):

# run the main function
if source_data_contains_time_range and time_stepsize == testdata.DT_ANALYSIS:
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)
else:
print(
f"Expecting ValueError for source_data_contains_time_range={source_data_contains_time_range} and time_stepsize={time_stepsize}"
)
with pytest.raises(ValueError):
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)


@pytest.mark.parametrize("use_common_feature_var_name", [True, False])
Expand Down Expand Up @@ -241,11 +241,11 @@ def test_feature_collision(use_common_feature_var_name):

if use_common_feature_var_name:
with pytest.raises(InvalidConfigException):
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)
else:
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)


def test_danra_example():
fp_config = Path(__file__).parent.parent / "example.danra.yaml"
mdp.main(fp_config=fp_config)
mdp.create_dataset_zarr(fp_config=fp_config)

0 comments on commit 0ceaa1c

Please sign in to comment.