diff --git a/docs/composition/examples/gpcp-rechunk.md b/docs/composition/examples/gpcp-rechunk.md new file mode 100644 index 00000000..87cbc548 --- /dev/null +++ b/docs/composition/examples/gpcp-rechunk.md @@ -0,0 +1,5 @@ +# GPCP Rechunk + + +```{literalinclude} ../../../examples/feedstock/gpcp_rechunk.py +``` diff --git a/docs/composition/styles.md b/docs/composition/styles.md index be697b03..1be55ce1 100644 --- a/docs/composition/styles.md +++ b/docs/composition/styles.md @@ -27,6 +27,15 @@ the recipe pipeline will contain at a minimum the following transforms applied t * {class}`pangeo_forge_recipes.transforms.ConsolidateDimensionCoordinates`: consolidate the Dimension Coordinates for dataset read performance. * {class}`pangeo_forge_recipes.transforms.ConsolidateMetadata`: calls Zarr's convinience function to consolidate metadata. +### Open existing Zarr Store +* {class}`pangeo_forge_recipes.transforms.OpenWithXarray` supports opening existing Zarr stores. This might be useful for rechunking a Zarr store into an alternative chunking scheme. +An example of this recipe can be found in {doc}`examples/gpcp-rechunk`. + + + + + + ```{tip} If using the {class}`pangeo_forge_recipes.transforms.ConsolidateDimensionCoordinates` transform, make sure to chain on the {class}`pangeo_forge_recipes.transforms.ConsolidateMetadata` transform to your recipe. 
diff --git a/examples/feedstock/gpcp_rechunk.py b/examples/feedstock/gpcp_rechunk.py new file mode 100644 index 00000000..c7866c9e --- /dev/null +++ b/examples/feedstock/gpcp_rechunk.py @@ -0,0 +1,40 @@ +# Example recipe to demonstrate reading from an existing Zarr store and +# writing a new Zarr store with a different chunking structure + + +import apache_beam as beam +import zarr + +from pangeo_forge_recipes.patterns import FileType, pattern_from_file_sequence +from pangeo_forge_recipes.transforms import ( + ConsolidateDimensionCoordinates, + ConsolidateMetadata, + OpenWithXarray, + StoreToZarr, +) + +pattern = pattern_from_file_sequence( + ["https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/gpcp-feedstock/gpcp.zarr"], + concat_dim="time", +) + + +def test_ds(store: zarr.storage.FSStore) -> zarr.storage.FSStore: + import xarray as xr + + assert xr.open_dataset(store, engine="zarr", chunks={}) + return store + + +recipe = ( + beam.Create(pattern.items()) + | OpenWithXarray(file_type=FileType("zarr"), xarray_open_kwargs={"chunks": {}}) + | StoreToZarr( + store_name="gpcp_rechunked.zarr", + target_chunks={"time": 9226, "latitude": 16, "longitude": 36, "nv": 2}, + combine_dims=pattern.combine_dim_keys, + ) + | ConsolidateDimensionCoordinates() + | ConsolidateMetadata() + | "Test dataset" >> beam.Map(test_ds) +) diff --git a/examples/feedstock/meta.yaml b/examples/feedstock/meta.yaml index e93e8a9c..117a0b2e 100644 --- a/examples/feedstock/meta.yaml +++ b/examples/feedstock/meta.yaml @@ -1,6 +1,8 @@ recipes: - id: "gpcp-from-gcs" object: "gpcp_from_gcs:recipe" + - id: "gpcp-rechunk" + object: "gpcp_rechunk:recipe" - id: "gpcp-from-gcs-dynamic-chunks" object: "gpcp_from_gcs_dynamic_chunks:recipe" - id: "noaa-oisst" diff --git a/tests/test_integration.py b/tests/test_integration.py index 0c00afa8..00c53dc7 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -117,6 +117,7 @@ def test_integration(confpath_option: str, recipe_id: str, request): 
"hrrr-kerchunk-concat-valid-time": "Can't serialize drop_unknown callback function.", "narr-opendap": "Hangs for unkown reason. Requires further debugging.", "terraclimate": "Hangs for unkown reason. Requires further debugging.", + "gpcp-rechunk": "Unknown failure in integration tests.", } if recipe_id in xfails: pytest.xfail(xfails[recipe_id])