From 98f8e4a6c0488d0098f6406c91e63725b67a545d Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Wed, 20 Mar 2024 14:27:35 -0400 Subject: [PATCH 1/5] failing test --- virtualizarr/tests/test_xarray.py | 56 +++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index 2bc00ca..b99dccc 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -72,6 +72,7 @@ def test_equals(self): assert not ds1.equals(ds3) +# TODO refactor these tests by making some fixtures class TestConcat: def test_concat_along_existing_dim(self): # both manifest arrays in this example have the same zarray properties @@ -164,3 +165,58 @@ def test_concat_along_new_dim(self): assert result.data.zarray.fill_value == zarray.fill_value assert result.data.zarray.order == zarray.order assert result.data.zarray.zarr_format == zarray.zarr_format + + def test_concat_dim_coords_along_existing_dim(self): + # both manifest arrays in this example have the same zarray properties + zarray = ZArray( + chunks=(10,), + compressor="zlib", + dtype=np.dtype("int32"), + fill_value=0.0, + filters=None, + order="C", + shape=(20,), + zarr_format=2, + ) + + chunks_dict1 = { + "0": {"path": "foo.nc", "offset": 100, "length": 100}, + "1": {"path": "foo.nc", "offset": 200, "length": 100}, + } + manifest1 = ChunkManifest(entries=chunks_dict1) + marr1 = ManifestArray(zarray=zarray, chunkmanifest=manifest1) + coords = xr.Coordinates({"t": (["t"], marr1)}, indexes={}) + ds1 = xr.Dataset(coords=coords) + + chunks_dict2 = { + "0": {"path": "foo.nc", "offset": 300, "length": 100}, + "1": {"path": "foo.nc", "offset": 400, "length": 100}, + } + manifest2 = ChunkManifest(entries=chunks_dict2) + marr2 = ManifestArray(zarray=zarray, chunkmanifest=manifest2) + coords = xr.Coordinates({"t": (["t"], marr2)}, indexes={}) + ds2 = xr.Dataset(coords=coords) + + # TODO this fails because xr.concat tries to rebuild indexes on dimension coordinates automatically + # should there be a join='ignore' option? + result = xr.concat([ds1, ds2], dim="t", coords="minimal", compat="override")[ + "a" + ] + + # TODO this also causes a separate problem + # TypeError: Could not find a Chunk Manager which recognises type + # result = xr.concat([ds1, ds2], dim="x")["a"] + + assert result.shape == (40,) + assert result.chunks == (10,) + assert result.data.manifest.dict() == { + "0": {"path": "foo.nc", "offset": 100, "length": 100}, + "1": {"path": "foo.nc", "offset": 200, "length": 100}, + "2": {"path": "foo.nc", "offset": 300, "length": 100}, + "3": {"path": "foo.nc", "offset": 400, "length": 100}, + } + assert result.data.zarray.compressor == zarray.compressor + assert result.data.zarray.filters == zarray.filters + assert result.data.zarray.fill_value == zarray.fill_value + assert result.data.zarray.order == zarray.order + assert result.data.zarray.zarr_format == zarray.zarr_format From 962a618399645a66c57b331652e26d2e5757eb3f Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 25 Mar 2024 11:30:15 -0400 Subject: [PATCH 2/5] test passes with correct version of xarray --- virtualizarr/tests/test_xarray.py | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/virtualizarr/tests/test_xarray.py b/virtualizarr/tests/test_xarray.py index b99dccc..4355d03 100644 --- a/virtualizarr/tests/test_xarray.py +++ b/virtualizarr/tests/test_xarray.py @@ -167,6 +167,9 @@ def test_concat_along_new_dim(self): assert result.data.zarray.zarr_format == zarray.zarr_format def test_concat_dim_coords_along_existing_dim(self): + # Tests that dimension coordinates don't automatically get new indexes on concat + # See https://github.com/pydata/xarray/issues/8871 + # both manifest arrays in this example have the same zarray properties zarray = ZArray( chunks=(10,), @@ -197,15 +200,7 @@ def test_concat_dim_coords_along_existing_dim(self): coords = xr.Coordinates({"t": (["t"], marr2)}, indexes={}) ds2 = xr.Dataset(coords=coords) - # TODO this fails because xr.concat tries to rebuild indexes on dimension coordinates automatically - # should there be a join='ignore' option? - result = xr.concat([ds1, ds2], dim="t", coords="minimal", compat="override")[ - "a" - ] - - # TODO this also causes a separate problem - # TypeError: Could not find a Chunk Manager which recognises type - # result = xr.concat([ds1, ds2], dim="x")["a"] + result = xr.concat([ds1, ds2], dim="t")["t"] assert result.shape == (40,) assert result.chunks == (10,) From 30784f900f58cee0e9e4c6d3578e6889c56d2ced Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 25 Mar 2024 11:45:43 -0400 Subject: [PATCH 3/5] use fork of xarray in tests --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index 82b0044..0a6dc45 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -15,4 +15,4 @@ dependencies: - ujson - pydantic - pip: - - xarray>=2024.02.0.dev0 + - git+https://github.com/TomNicholas/xarray.git@concat-no-indexes#egg=xarray \ No newline at end of file From ef5ec14f3afcc43ea6b153fe27e6d86b894d0847 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 25 Mar 2024 15:45:55 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- ci/environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/environment.yml b/ci/environment.yml index 0a6dc45..876c9a5 100644 --- a/ci/environment.yml +++ b/ci/environment.yml @@ -15,4 +15,4 @@ dependencies: - ujson - pydantic - pip: - - git+https://github.com/TomNicholas/xarray.git@concat-no-indexes#egg=xarray \ No newline at end of file + - git+https://github.com/TomNicholas/xarray.git@concat-no-indexes#egg=xarray From 85472bce92ed36abb49bc7f4f0107ee7cca8649d Mon Sep 17 00:00:00 2001 From: TomNicholas Date: Mon, 25 Mar 2024 12:15:20 -0400 Subject: [PATCH 5/5] note in readme about required branch of xarray --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0f91224..3e0bc4b 100644 --- a/README.md +++ b/README.md @@ -23,6 +23,7 @@ Currently you need to clone VirtualiZarr and install it locally: git clone virtualizarr pip install -e . ``` +You will also need a specific branch of xarray in order for concatenation without indexes to work. (See [this comment](https://github.com/TomNicholas/VirtualiZarr/issues/14#issuecomment-2018369470).) You may want to install the dependencies using the `virtualizarr/ci/environment.yml` conda file, which includes the specific branch of xarray required. ### Usage