google
diff --git a/Diff for: ‎.github/workflows/ci-build.yml
+8-4 b/Diff for: ‎.github/workflows/ci-build.yml
+8-4
diff --git a/Diff for: ‎README.md
+38-38 b/Diff for: ‎README.md
+38-38
diff --git a/Diff for: ‎examples/__init__.py
+13 b/Diff for: ‎examples/__init__.py
+13
diff --git a/Diff for: ‎examples/era5_climatology.py
+14-9 b/Diff for: ‎examples/era5_climatology.py
+14-9
diff --git a/Diff for: ‎examples/era5_climatology_test.py
+50 b/Diff for: ‎examples/era5_climatology_test.py
+50
diff --git a/Diff for: ‎examples/era5_rechunk.py
+6-15 b/Diff for: ‎examples/era5_rechunk.py
+6-15
diff --git a/Diff for: ‎examples/era5_rechunk_test.py
+50 b/Diff for: ‎examples/era5_rechunk_test.py
+50
diff --git a/Diff for: ‎setup.py
+14-3 b/Diff for: ‎setup.py
+14-3
@@ -37,10 +37,14 @@ jobs:
       with:
         path: ${{ steps.pip-cache.outputs.dir }}
         key: ${{ runner.os }}-pip-${{ hashFiles('**/setup.py') }}
-    - name: Install dependencies
+    - name: Install Xarray-Beam
       run: |
-        pip install dask rechunker apache_beam zarr xarray absl-py pytest
-    - name: Test with pytest
+        pip install -e .[tests]
+    - name: Run unit tests
       run: |
-        pip install -e .
         pytest xarray_beam
+    - name: Run example tests
+      # The examples define some of the same flags, so we run pytest in separate processes.
+      run: |
+        pytest examples/era5_climatology_test.py
+        pytest examples/era5_rechunk_test.py
@@ -7,15 +7,15 @@ Xarray-Beam is a Python library for building
 The project aims to facilitate data transformations and analysis on large-scale
 multi-dimensional labeled arrays, such as:
 
-- Ad-hoc computation on Xarray data, by dividing a `xarray.Dataset` into many
-  smaller pieces ("chunks").
-- Adjusting array chunks, using the
-  [Rechunker algorithm](https://rechunker.readthedocs.io/en/latest/algorithm.html)
-- Ingesting large multi-dimensional array datasets into an analysis-ready,
-  cloud-optimized format, namely [Zarr](https://zarr.readthedocs.io/) (see also
-  [Pangeo Forge](https://github.com/pangeo-forge/pangeo-forge-recipes))
-- Calculating statistics (e.g., "climatology") across distributed datasets with
-  arbitrary groups.
+-   Ad-hoc computation on Xarray data, by dividing a `xarray.Dataset` into many
+    smaller pieces ("chunks").
+-   Adjusting array chunks, using the
+    [Rechunker algorithm](https://rechunker.readthedocs.io/en/latest/algorithm.html)
+-   Ingesting large multi-dimensional array datasets into an analysis-ready,
+    cloud-optimized format, namely [Zarr](https://zarr.readthedocs.io/) (see
+    also [Pangeo Forge](https://github.com/pangeo-forge/pangeo-forge-recipes))
+-   Calculating statistics (e.g., "climatology") across distributed datasets
+    with arbitrary groups.
 
 Xarray-Beam is implemented as a _thin layer_ on top of existing libraries for
 working with large-scale Xarray datasets. For example, it leverages
@@ -35,29 +35,29 @@ from early adopters, and hope to have it ready for wider audience soon.
 We love Dask! Xarray-Beam explores a different part of the design space for
 distributed data pipelines than Xarray's built-in Dask integration:
 
-- Xarray-Beam is built around explicit manipulation of
-  `(ChunkKey, xarray.Dataset)` pairs to perform operations on distributed
-  datasets, where `ChunkKey` is an immutable dict keeping track of the offsets
-  from the origin for a small contiguous "chunk" of a larger distributed
-  dataset. This requires more boilerplate but is also more robust than
-  generating distributed computation graphs in Dask using Xarray's built-in API.
-  The user is expected to have a mental model for how their data pipeline is
-  distributed across many machines.
-- Xarray-Beam distributes datasets by splitting them into many `xarray.Dataset`
-  chunks, rather than the chunks of NumPy arrays typically used by Xarray with
-  Dask (unless using
-  [xarray.map_blocks](http://xarray.pydata.org/en/stable/user-guide/dask.html#automatic-parallelization-with-apply-ufunc-and-map-blocks)).
-  Chunks of datasets is a more convenient data-model for writing ad-hoc
-  whole dataset transformations, but is potentially a bit less efficient.
-- Beam ([like Spark](https://docs.dask.org/en/latest/spark.html)) was designed
-  around a higher-level model for distributed computation than Dask (although
-  Dask has been making
-  [progress in this direction](https://coiled.io/blog/dask-under-the-hood-scheduler-refactor/)).
-  Roughly speaking, this trade-off favors scalability over flexibility.
-- Beam allows for executing distributed computation using multiple runners,
-  notably including Google Cloud Dataflow and Apache Spark. These runners are
-  more mature than Dask, and in many cases are supported as a service by major
-  commercial cloud providers.
+-   Xarray-Beam is built around explicit manipulation of `(xarray_beam.Key,
+    xarray.Dataset)` pairs to perform operations on distributed datasets, where
+    `Key` is an immutable dict keeping track of the offsets from the origin for
+    a small contiguous "chunk" of a larger distributed dataset. This requires
+    more boilerplate but is also more robust than generating distributed
+    computation graphs in Dask using Xarray's built-in API. The user is expected
+    to have a mental model for how their data pipeline is distributed across
+    many machines.
+-   Xarray-Beam distributes datasets by splitting them into many
+    `xarray.Dataset` chunks, rather than the chunks of NumPy arrays typically
+    used by Xarray with Dask (unless using
+    [xarray.map_blocks](http://xarray.pydata.org/en/stable/user-guide/dask.html#automatic-parallelization-with-apply-ufunc-and-map-blocks)).
+    Chunks of datasets are a more convenient data model for writing ad-hoc
+    whole-dataset transformations, but are potentially a bit less efficient.
+-   Beam ([like Spark](https://docs.dask.org/en/latest/spark.html)) was designed
+    around a higher-level model for distributed computation than Dask (although
+    Dask has been making
+    [progress in this direction](https://coiled.io/blog/dask-under-the-hood-scheduler-refactor/)).
+    Roughly speaking, this trade-off favors scalability over flexibility.
+-   Beam allows for executing distributed computation using multiple runners,
+    notably including Google Cloud Dataflow and Apache Spark. These runners are
+    more mature than Dask, and in many cases are supported as a service by major
+    commercial cloud providers.
 
 ![Xarray-Beam datamodel vs Xarray-Dask](./static/xarray-beam-vs-xarray-dask.png)
 
@@ -70,9 +70,9 @@ representation similar to that used by dask.array.
 
 ## Getting started
 
-Xarray-Beam requires recent versions of xarray, dask, rechunker and zarr. It
-needs the latest release of Apache Beam (2.31.0 or later). For good performance
-when writing Zarr files, we strongly recommend patching Xarray with
+Xarray-Beam requires recent versions of immutabledict, xarray, dask, rechunker
+and zarr. It also needs Apache Beam 2.31.0 or later. For good performance when
+writing Zarr files, we strongly recommend patching Xarray with
 [this pull request](https://github.com/pydata/xarray/pull/5252).
 
 TODO(shoyer): write a tutorial here! For now, see the test suite for examples.
@@ -90,6 +90,6 @@ See the "Contribution guidelines" for more.
 
 Contributors:
 
-- Stephan Hoyer
-- Jason Hickey
-- Cenk Gazen
+-   Stephan Hoyer
+-   Jason Hickey
+-   Cenk Gazen
@@ -0,0 +1,13 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
@@ -12,12 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """Calculate climatology for the Pangeo ERA5 surface dataset."""
+from typing import Tuple
+
 from absl import app
 from absl import flags
 import apache_beam as beam
 import numpy as np
 import xarray
-import xarray_beam
+import xarray_beam as xbeam
 
 
 INPUT_PATH = flags.DEFINE_string('input_path', None, help='Input Zarr path')
@@ -29,10 +31,13 @@
 # pylint: disable=expression-not-assigned
 
 
-def rekey_chunk_on_month_hour(key, dataset):
+def rekey_chunk_on_month_hour(
+    key: xbeam.Key, dataset: xarray.Dataset,
+) -> Tuple[xbeam.Key, xarray.Dataset]:
+  """Replace the 'time' dimension with 'month'/'hour'."""
   month = dataset.time.dt.month.item()
   hour = dataset.time.dt.hour.item()
-  new_key = key - {'time'} | {'month': month - 1, 'hour': hour}
+  new_key = key.with_offsets(time=None, month=month - 1, hour=hour)
   new_dataset = (
       dataset
       .squeeze('time', drop=True)
@@ -52,24 +57,24 @@ def main(argv):
   # pipeline. We don't really need to supply a template here because the outputs
   # are small (the template argument in ChunksToZarr is optional), but it makes
   # the pipeline slightly more efficient.
+  max_month = source_dataset.time.dt.month.max().item()  # normally 12
   template = (
       source_dataset
       .isel(time=0, drop=True)
       .pipe(xarray.zeros_like)  # don't load even time=0 into memory
-      .expand_dims(month=np.arange(12)+1, hour=np.arange(24))
+      .expand_dims(month=np.arange(1, max_month + 1), hour=np.arange(24))
       .chunk({'hour': 1, 'month': 1})  # make lazy with dask
       .pipe(xarray.zeros_like)  # compress the dask graph
   )
 
   with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
     (
         root
-        | xarray_beam.DatasetToChunks(source_dataset, {'time': 31})
-        | xarray_beam.SplitChunks({'time': 1})
+        | xbeam.DatasetToChunks(source_dataset, {'time': 31})
+        | xbeam.SplitChunks({'time': 1})
         | beam.MapTuple(rekey_chunk_on_month_hour)
-        | xarray_beam.Mean.PerKey(dtype=np.float64)  # avoid overflow
-        | beam.MapTuple(lambda k, v: (k, v.astype(np.float32)))
-        | xarray_beam.ChunksToZarr(OUTPUT_PATH.value, template)
+        | xbeam.Mean.PerKey()
+        | xbeam.ChunksToZarr(OUTPUT_PATH.value, template)
     )
 
 
 
@@ -0,0 +1,50 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for era5_climatology."""
+
+from absl.testing import absltest
+from absl.testing import flagsaver
+import numpy as np
+import pandas as pd
+import xarray
+
+from . import era5_climatology
+from xarray_beam._src import test_util
+
+
+class Era5ClimatologyTest(test_util.TestCase):
+
+  def test(self):
+    input_path = self.create_tempdir('source').full_path
+    output_path = self.create_tempdir('destination').full_path
+
+    input_ds = test_util.dummy_era5_surface_dataset(times=90*24, freq='1H')
+    input_ds.chunk({'time': 31}).to_zarr(input_path)
+
+    expected = input_ds.groupby('time.month').apply(
+        lambda x: x.groupby('time.hour').mean('time')
+    )
+
+    with flagsaver.flagsaver(
+        input_path=input_path,
+        output_path=output_path,
+    ):
+      era5_climatology.main([])
+
+    actual = xarray.open_zarr(output_path)
+    xarray.testing.assert_allclose(actual, expected)
+
+
+if __name__ == '__main__':
+  absltest.main()
@@ -16,7 +16,7 @@
 from absl import flags
 import apache_beam as beam
 import xarray
-import xarray_beam
+import xarray_beam as xbeam
 
 
 INPUT_PATH = flags.DEFINE_string('input_path', None, help='Input Zarr path')
@@ -34,28 +34,19 @@ def main(argv):
   )
   template = xarray.zeros_like(source_dataset.chunk())
   source_chunks = {'latitude': -1, 'longitude': -1, 'time': 31}
-  split_chunks = {'latitude': 1440//8, 'longitude': -1, 'time': 31}
   target_chunks = {'latitude': 5, 'longitude': 5, 'time': -1}
 
   with beam.Pipeline(runner=RUNNER.value, argv=argv) as root:
     (
         root
-        | xarray_beam.DatasetToChunks(source_dataset, source_chunks)
-        # add an intermediate splitting, because rechunker complains about
-        # source chunks too big to fit into memory.
-        | xarray_beam.SplitChunks(split_chunks)
-        # TODO(shoyer): split this rechunk per data variable; it currently ends
-        # up producing tiny intermediate chunks (50 KB), which adds significant
-        # overhead.
-        | xarray_beam.Rechunk(
+        | xbeam.DatasetToChunks(source_dataset, source_chunks, split_vars=True)
+        | xbeam.Rechunk(
             source_dataset.sizes,
-            split_chunks,
+            source_chunks,
             target_chunks,
-            itemsize=len(source_dataset.data_vars) * 4,
-        )
-        | xarray_beam.ChunksToZarr(
-            OUTPUT_PATH.value, template, target_chunks,
+            itemsize=4,
         )
+        | xbeam.ChunksToZarr(OUTPUT_PATH.value, template, target_chunks)
     )
 
 
 
@@ -0,0 +1,50 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tests for era5_rechunk."""
+
+from absl.testing import absltest
+from absl.testing import flagsaver
+import numpy as np
+import pandas as pd
+import xarray
+
+from . import era5_rechunk
+from xarray_beam._src import test_util
+
+
+class Era5RechunkTest(test_util.TestCase):
+
+  def test(self):
+    input_path = self.create_tempdir('source').full_path
+    output_path = self.create_tempdir('destination').full_path
+
+    input_ds = test_util.dummy_era5_surface_dataset(times=365)
+    input_ds.chunk({'time': 31}).to_zarr(input_path)
+
+    with flagsaver.flagsaver(
+        input_path=input_path,
+        output_path=output_path,
+    ):
+      era5_rechunk.main([])
+
+    output_ds = xarray.open_zarr(output_path)
+    self.assertEqual(
+        {k: v[0] for k, v in output_ds.chunks.items()},
+        {'latitude': 5, 'longitude': 5, 'time': 365}
+    )
+    xarray.testing.assert_identical(input_ds, output_ds)
+
+
+if __name__ == '__main__':
+  absltest.main()
@@ -16,12 +16,23 @@
 import setuptools
 
 
-base_requires = ['apache_beam>=2.31.0', 'dask', 'rechunker', 'zarr', 'xarray']
-tests_requires = ['absl-py', 'pytest']
+base_requires = [
+    'apache_beam>=2.31.0',
+    'dask',
+    'immutabledict',
+    'rechunker',
+    'zarr',
+    'xarray',
+]
+tests_requires = [
+    'absl-py',
+    'pandas',
+    'pytest',
+]
 
 setuptools.setup(
     name='xarray-beam',
-    version='0.0.1',
+    version='0.2.0 ',
     license='Apache 2.0',
     author='Google LLC',
     author_email='[email protected]',