From b32f058413fc6be46aac945d45a04b219bbf5c1c Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 21:14:10 +0200 Subject: [PATCH 01/16] add notebook for chunking methods --- docs/chunking.ipynb | 891 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 891 insertions(+) create mode 100644 docs/chunking.ipynb diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb new file mode 100644 index 00000000..c171dea6 --- /dev/null +++ b/docs/chunking.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e515b4bd-a302-45a9-8464-56b67a73a46c", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96e9149e-fc6d-4048-8e45-a29966e5c6b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from itertools import product\n", + "import numpy as np\n", + "\n", + "import xarray as xr\n", + "import xbitinfo as xb" + ] + }, + { + "cell_type": "markdown", + "id": "b64e0873-0a27-4757-947a-4a559a102288", + "metadata": {}, + "source": [ + "## Data loading" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "320224c9-06e2-428a-8614-8ed0d15eee82", + "metadata": {}, + "outputs": [], + "source": [ + "# load data\n", + "ds = xr.tutorial.load_dataset(\"air_temperature\") \n", + "chunks = {'lat':5,'lon':10} # Defining chunks that will be used for the reading/bitrounding/writing\n", + "ds = ds.chunk(chunks) # Apply chunking" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f3120040-79f1-4a7f-a61f-afec9fb3ca5b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 25, time: 2920, lon: 53)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
+       "  * lon      (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
+       "  * time     (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
+       "Data variables:\n",
+       "    air      (time, lat, lon) float32 dask.array<chunksize=(2920, 5, 10), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  COARDS\n",
+       "    title:        4x daily NMC reanalysis (1948)\n",
+       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
+       "    platform:     Model\n",
+       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
" + ], + "text/plain": [ + "\n", + "Dimensions: (lat: 25, time: 2920, lon: 53)\n", + "Coordinates:\n", + " * lat (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n", + " * lon (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n", + " * time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n", + "Data variables:\n", + " air (time, lat, lon) float32 dask.array\n", + "Attributes:\n", + " Conventions: COARDS\n", + " title: 4x daily NMC reanalysis (1948)\n", + " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", + " platform: Model\n", + " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..." + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "b9e8fe5a-2e4e-4dfd-8026-0991e9988668", + "metadata": {}, + "source": [ + "## Saving to file" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e011a900-5da2-40be-a292-d81a0cafcd6d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_11221/350902741.py:1: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " ds.to_netcdf(\"0.air_original.nc\")\n" + ] + } + ], + "source": [ + "ds.to_netcdf(\"0.air_original.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "2b98628e-cbcb-4018-8565-4c0324cf2d61", + "metadata": {}, + "source": [ + "## Compress with `to_compressed_netcdf`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "99d02f35-85fc-4a8d-94a0-880ac2ffbb72", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ayoubf/Projects/xbitinfo/xbitinfo/save_compressed.py:121: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " 
self._obj.to_netcdf(\n" + ] + } + ], + "source": [ + "ds.to_compressed_netcdf(\"1.air_compressed_all.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "5f5aae30-4a0a-401c-9018-9e34626c3d2c", + "metadata": {}, + "source": [ + "## Compress with bitrounding" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fdf077dd-6494-4c38-9461-5ea7ac370a01", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eedef4b12a25419fba0f9d7348e619a2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00>> slices_from_chunks(((2, 2), (3, 3, 3))) # doctest: +NORMALIZE_WHITESPACE\n", + " [(slice(0, 2, None), slice(0, 3, None)),\n", + " (slice(0, 2, None), slice(3, 6, None)),\n", + " (slice(0, 2, None), slice(6, 9, None)),\n", + " (slice(2, 4, None), slice(0, 3, None)),\n", + " (slice(2, 4, None), slice(3, 6, None)),\n", + " (slice(2, 4, None), slice(6, 9, None))]\n", + " \"\"\"\n", + " cumdims = []\n", + " for bds in chunks:\n", + " out = np.empty(len(bds)+1, dtype=int)\n", + " out[0] = 0\n", + " np.cumsum(bds, out=out[1:])\n", + " cumdims.append(out)\n", + " slices = [\n", + " [slice(s, s + dim) for s, dim in zip(starts, shapes)]\n", + " for starts, shapes in zip(cumdims, chunks)\n", + " ]\n", + " return list(product(*slices))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "19032fcd-93bc-48b8-ba1f-beba9673491b", + "metadata": {}, + "outputs": [], + "source": [ + "fn = 'air.zarr' # Output filename\n", + "ds.to_compressed_zarr(fn, compute=False, mode='w') # Creates empty file structure" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7221b47f-b8f4-4ebf-bc2b-cb61d12989be", + "metadata": {}, + "outputs": [], + "source": [ + "dims = ds.air.dims\n", + "len_dims = len(dims)\n", + "slices = slices_from_chunks(ds.air.chunks)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7c2ed18f-4dc8-4f5c-88ed-ae5ad41d1647", + 
"metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "for b, block in enumerate(ds.air.data.to_delayed().ravel()): # Loop over each chunk\n", + " #slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", + " ds_block = xr.Dataset({'air':(dims, block.compute())}) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", + " rounded_ds = bitrounding(ds_block) # Apply bitrounding\n", + " rounded_ds.to_zarr(fn, region={dims[d]:s for (d,s) in enumerate(slices[b])}) # Write individual chunk to disk" + ] + }, + { + "cell_type": "markdown", + "id": "9ae3603f-291d-4c7f-92a8-95d9935daf35", + "metadata": {}, + "source": [ + "## Creating smaller datasets as chunks and compressing" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1d53f86f-fa72-4161-a364-8c1f78dba6d6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "at_least_zero = lambda x: max(x, 0)\n", + "\n", + "chunk_long, chunk_lat = [10, 5] # for int division\n", + "var = 'lat'\n", + "\n", + "dss = []\n", + "dss_bitrounded = []\n", + "dss_kbits = []\n", + "\n", + "long_c = int(ds.lon.size / chunk_long)\n", + "lat_c = int(ds.lat.size / chunk_lat)\n", + "\n", + "for i in range(long_c):\n", + " for j in range(lat_c):\n", + " temp_ds = ds.isel(lon=slice(i*chunk_long, (i+1)*chunk_long),\n", + " lat=slice(j*chunk_lat, (j+1)*chunk_lat))\n", + " dss.append(temp_ds)\n", + " temp_info_pbit = xb.get_bitinformation(temp_ds, dim=var, implementation=\"python\")\n", + " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", + " # temp_keepbits = temp_keepbits.map(at_least_zero)\n", + " dss_kbits.append(temp_keepbits)\n", + " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", + " dss_bitrounded.append(temp_ds_bitrounded)\n", + "\n", + " if i == 0 and j == 0 : \n", + " MERGED_ds_bitr = temp_ds_bitrounded\n", + " else:\n", + " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, 
temp_ds_bitrounded])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7e1c4b35-4767-47e7-9ddf-357250b631b8", + "metadata": {}, + "outputs": [], + "source": [ + "MERGED_ds_bitr.to_compressed_netcdf(\"3.air_chunked_bitr_compressed.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "d3b60c66-252d-48a6-af93-a00c9ca8f0ba", + "metadata": {}, + "source": [ + "## ALL" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "998581b5-6ad9-4f6f-9c61-d0bf1486ec7f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.5M\t0.air_original.nc\n", + "1.7M\t1.air_compressed_all.nc\n", + "1.3M\t2.air_bitrounded_compressed.nc\n", + "776K\t3.air_chunked_bitr_compressed.nc\n", + "1.1M\tair.zarr\n" + ] + } + ], + "source": [ + "!du -hs *.nc *.zarr" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:bitinfo] *", + "language": "python", + "name": "conda-env-bitinfo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 69598b06fd1361c9bb65200c2507dbda132b27c7 Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 21:14:39 +0200 Subject: [PATCH 02/16] add chunking entry in docs --- docs/index.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/index.rst b/docs/index.rst index 5c476353..068b79e7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -96,6 +96,17 @@ Credits quick-start.ipynb +**Chunking** + +* :doc:`chunking` + +.. 
toctree:: + :maxdepth: 1 + :hidden: + :caption: Chunking + + chunking.ipynb + **Help & Reference** * :doc:`api` From 428a1b679446ca88f53833eb6018b45778fce6a6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 14 Oct 2023 19:15:59 +0000 Subject: [PATCH 03/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/chunking.ipynb | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index c171dea6..0e1476a5 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -40,8 +40,11 @@ "outputs": [], "source": [ "# load data\n", - "ds = xr.tutorial.load_dataset(\"air_temperature\") \n", - "chunks = {'lat':5,'lon':10} # Defining chunks that will be used for the reading/bitrounding/writing\n", + "ds = xr.tutorial.load_dataset(\"air_temperature\")\n", + "chunks = {\n", + " \"lat\": 5,\n", + " \"lon\": 10,\n", + "} # Defining chunks that will be used for the reading/bitrounding/writing\n", "ds = ds.chunk(chunks) # Apply chunking" ] }, @@ -703,7 +706,7 @@ "metadata": {}, "outputs": [], "source": [ - "def bitrounding(chunk, var='lat'):\n", + "def bitrounding(chunk, var=\"lat\"):\n", " \"\"\"\n", " Just a function that handles all the xbitinfo calls\n", " \"\"\"\n", @@ -712,8 +715,9 @@ " bitround = xb.xr_bitround(chunk, keepbits)\n", " return bitround\n", "\n", + "\n", "def slices_from_chunks(chunks):\n", - " \"\"\" Translate chunks tuple to a set of slices in product order\n", + " \"\"\"Translate chunks tuple to a set of slices in product order\n", "\n", " >>> slices_from_chunks(((2, 2), (3, 3, 3))) # doctest: +NORMALIZE_WHITESPACE\n", " [(slice(0, 2, None), slice(0, 3, None)),\n", @@ -725,7 +729,7 @@ " \"\"\"\n", " cumdims = []\n", " for bds in chunks:\n", - " out = np.empty(len(bds)+1, dtype=int)\n", + " out = np.empty(len(bds) + 1, dtype=int)\n", " 
out[0] = 0\n", " np.cumsum(bds, out=out[1:])\n", " cumdims.append(out)\n", @@ -743,8 +747,8 @@ "metadata": {}, "outputs": [], "source": [ - "fn = 'air.zarr' # Output filename\n", - "ds.to_compressed_zarr(fn, compute=False, mode='w') # Creates empty file structure" + "fn = \"air.zarr\" # Output filename\n", + "ds.to_compressed_zarr(fn, compute=False, mode=\"w\") # Creates empty file structure" ] }, { @@ -768,10 +772,14 @@ "source": [ "%%capture\n", "for b, block in enumerate(ds.air.data.to_delayed().ravel()): # Loop over each chunk\n", - " #slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", - " ds_block = xr.Dataset({'air':(dims, block.compute())}) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", + " # slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", + " ds_block = xr.Dataset(\n", + " {\"air\": (dims, block.compute())}\n", + " ) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", " rounded_ds = bitrounding(ds_block) # Apply bitrounding\n", - " rounded_ds.to_zarr(fn, region={dims[d]:s for (d,s) in enumerate(slices[b])}) # Write individual chunk to disk" + " rounded_ds.to_zarr(\n", + " fn, region={dims[d]: s for (d, s) in enumerate(slices[b])}\n", + " ) # Write individual chunk to disk" ] }, { @@ -796,8 +804,8 @@ "\n", "at_least_zero = lambda x: max(x, 0)\n", "\n", - "chunk_long, chunk_lat = [10, 5] # for int division\n", - "var = 'lat'\n", + "chunk_long, chunk_lat = [10, 5] # for int division\n", + "var = \"lat\"\n", "\n", "dss = []\n", "dss_bitrounded = []\n", @@ -808,17 +816,21 @@ "\n", "for i in range(long_c):\n", " for j in range(lat_c):\n", - " temp_ds = ds.isel(lon=slice(i*chunk_long, (i+1)*chunk_long),\n", - " lat=slice(j*chunk_lat, (j+1)*chunk_lat))\n", + " temp_ds = ds.isel(\n", + " lon=slice(i * chunk_long, (i + 1) * chunk_long),\n", + " lat=slice(j * chunk_lat, (j + 1) * chunk_lat),\n", + " )\n", " dss.append(temp_ds)\n", - " temp_info_pbit = 
xb.get_bitinformation(temp_ds, dim=var, implementation=\"python\")\n", + " temp_info_pbit = xb.get_bitinformation(\n", + " temp_ds, dim=var, implementation=\"python\"\n", + " )\n", " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", " # temp_keepbits = temp_keepbits.map(at_least_zero)\n", " dss_kbits.append(temp_keepbits)\n", " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", " dss_bitrounded.append(temp_ds_bitrounded)\n", "\n", - " if i == 0 and j == 0 : \n", + " if i == 0 and j == 0:\n", " MERGED_ds_bitr = temp_ds_bitrounded\n", " else:\n", " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, temp_ds_bitrounded])" From 1a1100739892333ea7e9c6fab2a04515d7d8b8e0 Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 22:59:47 +0200 Subject: [PATCH 04/16] change nb metadata to avoid CI failure --- docs/chunking.ipynb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index c171dea6..1fa31d4d 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -869,9 +869,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:bitinfo] *", + "display_name": "Python 3", "language": "python", - "name": "conda-env-bitinfo-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -883,8 +883,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" - } + "version": "3.10.4" + }, + "toc-autonumbering": true }, "nbformat": 4, "nbformat_minor": 5 From f5c56a53d32a1b622ac4227b048842b3164b21b9 Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 23:15:43 +0200 Subject: [PATCH 05/16] add title to nb --- docs/chunking.ipynb | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index e81a4a3e..166031b5 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -1,5 +1,13 @@ { 
"cells": [ + { + "cell_type": "markdown", + "id": "a1f40619", + "metadata": {}, + "source": [ + "# Chunking" + ] + }, { "cell_type": "markdown", "id": "e515b4bd-a302-45a9-8464-56b67a73a46c", From e9eee1cfa9b593ee848aa70acea0660fc311d0e5 Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 21:14:10 +0200 Subject: [PATCH 06/16] add notebook for chunking methods --- docs/chunking.ipynb | 891 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 891 insertions(+) create mode 100644 docs/chunking.ipynb diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb new file mode 100644 index 00000000..c171dea6 --- /dev/null +++ b/docs/chunking.ipynb @@ -0,0 +1,891 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e515b4bd-a302-45a9-8464-56b67a73a46c", + "metadata": {}, + "source": [ + "## Imports" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "96e9149e-fc6d-4048-8e45-a29966e5c6b8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from itertools import product\n", + "import numpy as np\n", + "\n", + "import xarray as xr\n", + "import xbitinfo as xb" + ] + }, + { + "cell_type": "markdown", + "id": "b64e0873-0a27-4757-947a-4a559a102288", + "metadata": {}, + "source": [ + "## Data loading" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "320224c9-06e2-428a-8614-8ed0d15eee82", + "metadata": {}, + "outputs": [], + "source": [ + "# load data\n", + "ds = xr.tutorial.load_dataset(\"air_temperature\") \n", + "chunks = {'lat':5,'lon':10} # Defining chunks that will be used for the reading/bitrounding/writing\n", + "ds = ds.chunk(chunks) # Apply chunking" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f3120040-79f1-4a7f-a61f-afec9fb3ca5b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
<xarray.Dataset>\n",
+       "Dimensions:  (lat: 25, time: 2920, lon: 53)\n",
+       "Coordinates:\n",
+       "  * lat      (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
+       "  * lon      (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
+       "  * time     (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
+       "Data variables:\n",
+       "    air      (time, lat, lon) float32 dask.array<chunksize=(2920, 5, 10), meta=np.ndarray>\n",
+       "Attributes:\n",
+       "    Conventions:  COARDS\n",
+       "    title:        4x daily NMC reanalysis (1948)\n",
+       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
+       "    platform:     Model\n",
+       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
" + ], + "text/plain": [ + "\n", + "Dimensions: (lat: 25, time: 2920, lon: 53)\n", + "Coordinates:\n", + " * lat (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n", + " * lon (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n", + " * time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n", + "Data variables:\n", + " air (time, lat, lon) float32 dask.array\n", + "Attributes:\n", + " Conventions: COARDS\n", + " title: 4x daily NMC reanalysis (1948)\n", + " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", + " platform: Model\n", + " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..." + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ds" + ] + }, + { + "cell_type": "markdown", + "id": "b9e8fe5a-2e4e-4dfd-8026-0991e9988668", + "metadata": {}, + "source": [ + "## Saving to file" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e011a900-5da2-40be-a292-d81a0cafcd6d", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_11221/350902741.py:1: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " ds.to_netcdf(\"0.air_original.nc\")\n" + ] + } + ], + "source": [ + "ds.to_netcdf(\"0.air_original.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "2b98628e-cbcb-4018-8565-4c0324cf2d61", + "metadata": {}, + "source": [ + "## Compress with `to_compressed_netcdf`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "99d02f35-85fc-4a8d-94a0-880ac2ffbb72", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/ayoubf/Projects/xbitinfo/xbitinfo/save_compressed.py:121: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + " 
self._obj.to_netcdf(\n" + ] + } + ], + "source": [ + "ds.to_compressed_netcdf(\"1.air_compressed_all.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "5f5aae30-4a0a-401c-9018-9e34626c3d2c", + "metadata": {}, + "source": [ + "## Compress with bitrounding" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "fdf077dd-6494-4c38-9461-5ea7ac370a01", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "eedef4b12a25419fba0f9d7348e619a2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00>> slices_from_chunks(((2, 2), (3, 3, 3))) # doctest: +NORMALIZE_WHITESPACE\n", + " [(slice(0, 2, None), slice(0, 3, None)),\n", + " (slice(0, 2, None), slice(3, 6, None)),\n", + " (slice(0, 2, None), slice(6, 9, None)),\n", + " (slice(2, 4, None), slice(0, 3, None)),\n", + " (slice(2, 4, None), slice(3, 6, None)),\n", + " (slice(2, 4, None), slice(6, 9, None))]\n", + " \"\"\"\n", + " cumdims = []\n", + " for bds in chunks:\n", + " out = np.empty(len(bds)+1, dtype=int)\n", + " out[0] = 0\n", + " np.cumsum(bds, out=out[1:])\n", + " cumdims.append(out)\n", + " slices = [\n", + " [slice(s, s + dim) for s, dim in zip(starts, shapes)]\n", + " for starts, shapes in zip(cumdims, chunks)\n", + " ]\n", + " return list(product(*slices))" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "19032fcd-93bc-48b8-ba1f-beba9673491b", + "metadata": {}, + "outputs": [], + "source": [ + "fn = 'air.zarr' # Output filename\n", + "ds.to_compressed_zarr(fn, compute=False, mode='w') # Creates empty file structure" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "7221b47f-b8f4-4ebf-bc2b-cb61d12989be", + "metadata": {}, + "outputs": [], + "source": [ + "dims = ds.air.dims\n", + "len_dims = len(dims)\n", + "slices = slices_from_chunks(ds.air.chunks)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "7c2ed18f-4dc8-4f5c-88ed-ae5ad41d1647", + 
"metadata": {}, + "outputs": [], + "source": [ + "%%capture\n", + "for b, block in enumerate(ds.air.data.to_delayed().ravel()): # Loop over each chunk\n", + " #slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", + " ds_block = xr.Dataset({'air':(dims, block.compute())}) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", + " rounded_ds = bitrounding(ds_block) # Apply bitrounding\n", + " rounded_ds.to_zarr(fn, region={dims[d]:s for (d,s) in enumerate(slices[b])}) # Write individual chunk to disk" + ] + }, + { + "cell_type": "markdown", + "id": "9ae3603f-291d-4c7f-92a8-95d9935daf35", + "metadata": {}, + "source": [ + "## Creating smaller datasets as chunks and compressing" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1d53f86f-fa72-4161-a364-8c1f78dba6d6", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "%%capture\n", + "\n", + "at_least_zero = lambda x: max(x, 0)\n", + "\n", + "chunk_long, chunk_lat = [10, 5] # for int division\n", + "var = 'lat'\n", + "\n", + "dss = []\n", + "dss_bitrounded = []\n", + "dss_kbits = []\n", + "\n", + "long_c = int(ds.lon.size / chunk_long)\n", + "lat_c = int(ds.lat.size / chunk_lat)\n", + "\n", + "for i in range(long_c):\n", + " for j in range(lat_c):\n", + " temp_ds = ds.isel(lon=slice(i*chunk_long, (i+1)*chunk_long),\n", + " lat=slice(j*chunk_lat, (j+1)*chunk_lat))\n", + " dss.append(temp_ds)\n", + " temp_info_pbit = xb.get_bitinformation(temp_ds, dim=var, implementation=\"python\")\n", + " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", + " # temp_keepbits = temp_keepbits.map(at_least_zero)\n", + " dss_kbits.append(temp_keepbits)\n", + " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", + " dss_bitrounded.append(temp_ds_bitrounded)\n", + "\n", + " if i == 0 and j == 0 : \n", + " MERGED_ds_bitr = temp_ds_bitrounded\n", + " else:\n", + " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, 
temp_ds_bitrounded])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "7e1c4b35-4767-47e7-9ddf-357250b631b8", + "metadata": {}, + "outputs": [], + "source": [ + "MERGED_ds_bitr.to_compressed_netcdf(\"3.air_chunked_bitr_compressed.nc\")" + ] + }, + { + "cell_type": "markdown", + "id": "d3b60c66-252d-48a6-af93-a00c9ca8f0ba", + "metadata": {}, + "source": [ + "## ALL" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "998581b5-6ad9-4f6f-9c61-d0bf1486ec7f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "7.5M\t0.air_original.nc\n", + "1.7M\t1.air_compressed_all.nc\n", + "1.3M\t2.air_bitrounded_compressed.nc\n", + "776K\t3.air_chunked_bitr_compressed.nc\n", + "1.1M\tair.zarr\n" + ] + } + ], + "source": [ + "!du -hs *.nc *.zarr" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [conda env:bitinfo] *", + "language": "python", + "name": "conda-env-bitinfo-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.3" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From face55ab8d36ae176b6c6bb364d700b17b3cc141 Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 21:14:39 +0200 Subject: [PATCH 07/16] add chunking entry in docs --- docs/index.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/index.rst b/docs/index.rst index 5c476353..068b79e7 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -96,6 +96,17 @@ Credits quick-start.ipynb +**Chunking** + +* :doc:`chunking` + +.. 
toctree:: + :maxdepth: 1 + :hidden: + :caption: Chunking + + chunking.ipynb + **Help & Reference** * :doc:`api` From 88e1b74e5485aa95d68900c83ec1e740a3509b77 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 14 Oct 2023 19:15:59 +0000 Subject: [PATCH 08/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/chunking.ipynb | 44 ++++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index c171dea6..0e1476a5 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -40,8 +40,11 @@ "outputs": [], "source": [ "# load data\n", - "ds = xr.tutorial.load_dataset(\"air_temperature\") \n", - "chunks = {'lat':5,'lon':10} # Defining chunks that will be used for the reading/bitrounding/writing\n", + "ds = xr.tutorial.load_dataset(\"air_temperature\")\n", + "chunks = {\n", + " \"lat\": 5,\n", + " \"lon\": 10,\n", + "} # Defining chunks that will be used for the reading/bitrounding/writing\n", "ds = ds.chunk(chunks) # Apply chunking" ] }, @@ -703,7 +706,7 @@ "metadata": {}, "outputs": [], "source": [ - "def bitrounding(chunk, var='lat'):\n", + "def bitrounding(chunk, var=\"lat\"):\n", " \"\"\"\n", " Just a function that handles all the xbitinfo calls\n", " \"\"\"\n", @@ -712,8 +715,9 @@ " bitround = xb.xr_bitround(chunk, keepbits)\n", " return bitround\n", "\n", + "\n", "def slices_from_chunks(chunks):\n", - " \"\"\" Translate chunks tuple to a set of slices in product order\n", + " \"\"\"Translate chunks tuple to a set of slices in product order\n", "\n", " >>> slices_from_chunks(((2, 2), (3, 3, 3))) # doctest: +NORMALIZE_WHITESPACE\n", " [(slice(0, 2, None), slice(0, 3, None)),\n", @@ -725,7 +729,7 @@ " \"\"\"\n", " cumdims = []\n", " for bds in chunks:\n", - " out = np.empty(len(bds)+1, dtype=int)\n", + " out = np.empty(len(bds) + 1, dtype=int)\n", " 
out[0] = 0\n", " np.cumsum(bds, out=out[1:])\n", " cumdims.append(out)\n", @@ -743,8 +747,8 @@ "metadata": {}, "outputs": [], "source": [ - "fn = 'air.zarr' # Output filename\n", - "ds.to_compressed_zarr(fn, compute=False, mode='w') # Creates empty file structure" + "fn = \"air.zarr\" # Output filename\n", + "ds.to_compressed_zarr(fn, compute=False, mode=\"w\") # Creates empty file structure" ] }, { @@ -768,10 +772,14 @@ "source": [ "%%capture\n", "for b, block in enumerate(ds.air.data.to_delayed().ravel()): # Loop over each chunk\n", - " #slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", - " ds_block = xr.Dataset({'air':(dims, block.compute())}) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", + " # slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", + " ds_block = xr.Dataset(\n", + " {\"air\": (dims, block.compute())}\n", + " ) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", " rounded_ds = bitrounding(ds_block) # Apply bitrounding\n", - " rounded_ds.to_zarr(fn, region={dims[d]:s for (d,s) in enumerate(slices[b])}) # Write individual chunk to disk" + " rounded_ds.to_zarr(\n", + " fn, region={dims[d]: s for (d, s) in enumerate(slices[b])}\n", + " ) # Write individual chunk to disk" ] }, { @@ -796,8 +804,8 @@ "\n", "at_least_zero = lambda x: max(x, 0)\n", "\n", - "chunk_long, chunk_lat = [10, 5] # for int division\n", - "var = 'lat'\n", + "chunk_long, chunk_lat = [10, 5] # for int division\n", + "var = \"lat\"\n", "\n", "dss = []\n", "dss_bitrounded = []\n", @@ -808,17 +816,21 @@ "\n", "for i in range(long_c):\n", " for j in range(lat_c):\n", - " temp_ds = ds.isel(lon=slice(i*chunk_long, (i+1)*chunk_long),\n", - " lat=slice(j*chunk_lat, (j+1)*chunk_lat))\n", + " temp_ds = ds.isel(\n", + " lon=slice(i * chunk_long, (i + 1) * chunk_long),\n", + " lat=slice(j * chunk_lat, (j + 1) * chunk_lat),\n", + " )\n", " dss.append(temp_ds)\n", - " temp_info_pbit = 
xb.get_bitinformation(temp_ds, dim=var, implementation=\"python\")\n", + " temp_info_pbit = xb.get_bitinformation(\n", + " temp_ds, dim=var, implementation=\"python\"\n", + " )\n", " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", " # temp_keepbits = temp_keepbits.map(at_least_zero)\n", " dss_kbits.append(temp_keepbits)\n", " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", " dss_bitrounded.append(temp_ds_bitrounded)\n", "\n", - " if i == 0 and j == 0 : \n", + " if i == 0 and j == 0:\n", " MERGED_ds_bitr = temp_ds_bitrounded\n", " else:\n", " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, temp_ds_bitrounded])" From f17fad9fe3edd181d5d371859d516c8d04e5e460 Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 22:59:47 +0200 Subject: [PATCH 09/16] change nb metadata to avoid CI failure --- docs/chunking.ipynb | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index 0e1476a5..e81a4a3e 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -881,9 +881,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:bitinfo] *", + "display_name": "Python 3", "language": "python", - "name": "conda-env-bitinfo-py" + "name": "python3" }, "language_info": { "codemirror_mode": { @@ -895,8 +895,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" - } + "version": "3.10.4" + }, + "toc-autonumbering": true }, "nbformat": 4, "nbformat_minor": 5 From 6656d7b13fb64749a0e48947a6f1e3160487c5b2 Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Sat, 14 Oct 2023 23:15:43 +0200 Subject: [PATCH 10/16] add title to nb --- docs/chunking.ipynb | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index e81a4a3e..166031b5 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -1,5 +1,13 @@ { 
"cells": [ + { + "cell_type": "markdown", + "id": "a1f40619", + "metadata": {}, + "source": [ + "# Chunking" + ] + }, { "cell_type": "markdown", "id": "e515b4bd-a302-45a9-8464-56b67a73a46c", From 0a8811a930c35bbff314f08553ab776ddf69f019 Mon Sep 17 00:00:00 2001 From: Hauke Schulz <43613877+observingClouds@users.noreply.github.com> Date: Thu, 14 Dec 2023 13:52:01 -0800 Subject: [PATCH 11/16] Rename menu subsection --- docs/index.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/index.rst b/docs/index.rst index 068b79e7..f333a444 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -96,14 +96,14 @@ Credits quick-start.ipynb -**Chunking** +**User Guide** * :doc:`chunking` .. toctree:: :maxdepth: 1 :hidden: - :caption: Chunking + :caption: User Guide chunking.ipynb From 5f4bda7d396be9745fdf07967c8c036bdd76680f Mon Sep 17 00:00:00 2001 From: ayoubft <63267601+ayoubft@users.noreply.github.com> Date: Fri, 15 Dec 2023 17:49:33 +0100 Subject: [PATCH 12/16] add chunks plot + comments --- docs/chunking.ipynb | 276 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 222 insertions(+), 54 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index 166031b5..0feb60a1 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -8,6 +8,16 @@ "# Chunking" ] }, + { + "cell_type": "markdown", + "id": "b8e2d4f5-c444-404a-8444-3648cb0a94bf", + "metadata": {}, + "source": [ + "Geospatial data can vary in its information density from one part of the world to another. A dataset containing streets will be very dense in cities but contains little information in remote places like the Alps or even the ocean. The same is also true for datasets about the ocean or the atmosphere.\n", + "\n", + "Currently in the bitinformation framework, to preserve all real information, the maximum information content calculated by `xbitinfo` needs to be used for the entire dataset. 
However, bitinformation can also be calculated on subsets, such that the ‘boring’ parts can therefore be more efficiently compressed. This notebook portrays how to do it." + ] + }, { "cell_type": "markdown", "id": "e515b4bd-a302-45a9-8464-56b67a73a46c", @@ -44,16 +54,22 @@ "cell_type": "code", "execution_count": 2, "id": "320224c9-06e2-428a-8614-8ed0d15eee82", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "# load data\n", "ds = xr.tutorial.load_dataset(\"air_temperature\")\n", + "\n", + "# Defining chunks that will be used for the reading/bitrounding/writing\n", "chunks = {\n", " \"lat\": 5,\n", " \"lon\": 10,\n", - "} # Defining chunks that will be used for the reading/bitrounding/writing\n", - "ds = ds.chunk(chunks) # Apply chunking" + "}\n", + "\n", + "# Apply chunking\n", + "ds = ds.chunk(chunks) " ] }, { @@ -443,17 +459,17 @@ " title: 4x daily NMC reanalysis (1948)\n", " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", " platform: Model\n", - " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..." 
], "text/plain": [ @@ -603,25 +619,28 @@ "id": "b9e8fe5a-2e4e-4dfd-8026-0991e9988668", "metadata": {}, "source": [ - "## Saving to file" + "## Saving to `NetCDF` file" ] }, { "cell_type": "code", "execution_count": 4, "id": "e011a900-5da2-40be-a292-d81a0cafcd6d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_11221/350902741.py:1: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", + "/tmp/ipykernel_24883/1840452313.py:2: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", " ds.to_netcdf(\"0.air_original.nc\")\n" ] } ], "source": [ + "# Saving the dataset as NetCDF file\n", "ds.to_netcdf(\"0.air_original.nc\")" ] }, @@ -637,7 +656,9 @@ "cell_type": "code", "execution_count": 5, "id": "99d02f35-85fc-4a8d-94a0-880ac2ffbb72", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stderr", @@ -649,6 +670,7 @@ } ], "source": [ + "# Compress and save the dataset as NetCDF file\n", "ds.to_compressed_netcdf(\"1.air_compressed_all.nc\")" ] }, @@ -664,12 +686,14 @@ "cell_type": "code", "execution_count": 6, "id": "fdf077dd-6494-4c38-9461-5ea7ac370a01", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "eedef4b12a25419fba0f9d7348e619a2", + "model_id": "d7a346d6cbd14460890ae3be8ec11ff0", "version_major": 2, "version_minor": 0 }, @@ -682,8 +706,13 @@ } ], "source": [ + "# Get bitinformation of the dataset along the 'longitude' dimension\n", "info_per_bit = xb.get_bitinformation(ds, dim=\"lon\", implementation=\"python\")\n", + "\n", + "# Get the number of bits necessary to keep 99% of information in our dataset\n", "keepbits = xb.get_keepbits(info_per_bit, 0.99)\n", + "\n", + "# Round the dataset using the keepbits number\n", 
"ds_bitrounded = xb.xr_bitround(ds, keepbits)" ] }, @@ -696,6 +725,7 @@ }, "outputs": [], "source": [ + "# Compress and save the bitrounded dataset as NetCDF file\n", "ds_bitrounded.to_compressed_netcdf(\"2.air_bitrounded_compressed.nc\")" ] }, @@ -707,11 +737,19 @@ "## Zarr chunking and compressing" ] }, + { + "cell_type": "markdown", + "id": "e837c725-b7de-4418-a530-113583411884", + "metadata": {}, + "source": [] + }, { "cell_type": "code", "execution_count": 8, "id": "91343d2a-63ec-4d61-a369-cc99139297e4", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "def bitrounding(chunk, var=\"lat\"):\n", @@ -752,7 +790,9 @@ "cell_type": "code", "execution_count": 9, "id": "19032fcd-93bc-48b8-ba1f-beba9673491b", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "fn = \"air.zarr\" # Output filename\n", @@ -763,11 +803,14 @@ "cell_type": "code", "execution_count": 10, "id": "7221b47f-b8f4-4ebf-bc2b-cb61d12989be", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "dims = ds.air.dims\n", "len_dims = len(dims)\n", + "\n", "slices = slices_from_chunks(ds.air.chunks)" ] }, @@ -775,19 +818,28 @@ "cell_type": "code", "execution_count": 11, "id": "7c2ed18f-4dc8-4f5c-88ed-ae5ad41d1647", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "%%capture\n", - "for b, block in enumerate(ds.air.data.to_delayed().ravel()): # Loop over each chunk\n", - " # slices = {d:s for (d,s) in zip(dims, block.key[1:1+len_dims])}\n", + "\n", + "# Loop over each chunk\n", + "for b, block in enumerate(ds.air.data.to_delayed().ravel()): \n", + "\n", + " # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", " ds_block = xr.Dataset(\n", " {\"air\": (dims, block.compute())}\n", - " ) # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", - " rounded_ds = bitrounding(ds_block) # Apply bitrounding\n", + " ) \n", + " \n", + " # Apply 
bitrounding\n", + " rounded_ds = bitrounding(ds_block)\n", + " \n", + " # Write individual chunk to disk\n", " rounded_ds.to_zarr(\n", " fn, region={dims[d]: s for (d, s) in enumerate(slices[b])}\n", - " ) # Write individual chunk to disk" + " )" ] }, { @@ -798,6 +850,12 @@ "## Creating smaller datasets as chunks and compressing" ] }, + { + "cell_type": "markdown", + "id": "4265a4fa-b397-4552-ac3c-a1a358fffcd0", + "metadata": {}, + "source": [] + }, { "cell_type": "code", "execution_count": 12, @@ -810,45 +868,52 @@ "source": [ "%%capture\n", "\n", + "# Define a lambda function to ensure that the value is at least zero\n", + "# negative keepbits not yet supported\n", "at_least_zero = lambda x: max(x, 0)\n", "\n", - "chunk_long, chunk_lat = [10, 5] # for int division\n", - "var = \"lat\"\n", - "\n", + "# Create empty intermediate holders for plotting later\n", "dss = []\n", "dss_bitrounded = []\n", "dss_kbits = []\n", "\n", - "long_c = int(ds.lon.size / chunk_long)\n", - "lat_c = int(ds.lat.size / chunk_lat)\n", + "# How many chunks there are\n", + "long_c = int(ds.lon.size / chunks['lon'])\n", + "lat_c = int(ds.lat.size / chunks['lat'])\n", "\n", - "for i in range(long_c):\n", - " for j in range(lat_c):\n", - " temp_ds = ds.isel(\n", - " lon=slice(i * chunk_long, (i + 1) * chunk_long),\n", - " lat=slice(j * chunk_lat, (j + 1) * chunk_lat),\n", - " )\n", - " dss.append(temp_ds)\n", - " temp_info_pbit = xb.get_bitinformation(\n", - " temp_ds, dim=var, implementation=\"python\"\n", - " )\n", - " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", - " # temp_keepbits = temp_keepbits.map(at_least_zero)\n", - " dss_kbits.append(temp_keepbits)\n", - " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", - " dss_bitrounded.append(temp_ds_bitrounded)\n", + "for i, j in product(range(long_c), range(lat_c)):\n", + " \n", + " # Extract a chunk of the dataset\n", + " temp_ds = ds.isel(\n", + " lon=slice(i * chunks['lon'], (i + 1) * chunks['lon']),\n", + " 
lat=slice(j * chunks['lat'], (j + 1) * chunks['lat']),\n", + " )\n", + " dss.append(temp_ds)\n", + " \n", + " # Compress with bitrounding (See details above)\n", + " temp_info_pbit = xb.get_bitinformation(\n", + " temp_ds, dim='lat', implementation=\"python\"\n", + " )\n", + " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", + " temp_keepbits = temp_keepbits.where(temp_keepbits['air'] > 0, 0)\n", + " dss_kbits.append(temp_keepbits)\n", + " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", + " dss_bitrounded.append(temp_ds_bitrounded)\n", "\n", - " if i == 0 and j == 0:\n", - " MERGED_ds_bitr = temp_ds_bitrounded\n", - " else:\n", - " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, temp_ds_bitrounded])" + " # Merge the bitrounded datasets\n", + " if i == 0 and j == 0:\n", + " MERGED_ds_bitr = temp_ds_bitrounded\n", + " else:\n", + " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, temp_ds_bitrounded])" ] }, { "cell_type": "code", "execution_count": 13, "id": "7e1c4b35-4767-47e7-9ddf-357250b631b8", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "MERGED_ds_bitr.to_compressed_netcdf(\"3.air_chunked_bitr_compressed.nc\")" @@ -856,15 +921,100 @@ }, { "cell_type": "markdown", - "id": "d3b60c66-252d-48a6-af93-a00c9ca8f0ba", + "id": "5d628121-d5ec-4544-a47f-f47c86524b09", "metadata": {}, "source": [ - "## ALL" + "### Plot" ] }, { "cell_type": "code", "execution_count": 14, + "id": "d8835a3c-8af0-4423-baf4-84aa9a386f67", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import matplotlib as mpl\n", + "import matplotlib.patheffects as pe" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c3b5f657-cddd-4476-82a3-c3c2c1a6e7b6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# # Create a figure and axis and plot the air temperature\n", + "fig, ax = plt.subplots(figsize=(12, 6))\n", + "ds['air'].isel(time=0).plot(ax=ax, cmap='RdBu_r')\n", + "\n", + "for i in range(len(dss_bitrounded)):\n", + " \n", + " # Get chunk limits\n", + " lats = dss[i].lat\n", + " longs = dss[i].lon \n", + " x = float(min(longs[0], longs[-1]))\n", + " y = float(min(lats[0], lats[-1]))\n", + " w = float(abs(longs[0] - longs[-1]))\n", + " h = float(abs(lats[0] - lats[-1]))\n", + " \n", + " # Draw rectangle\n", + " rect = mpl.patches.Rectangle((x, y), width = w, height = h,\n", + " facecolor = \"none\", edgecolor = \"#E5E4E2\",\n", + " path_effects=[pe.withStroke(linewidth=3, foreground=\"gray\")])\n", + " ax.add_patch(rect)\n", + " \n", + " # Annotate number of keepbits\n", + " rx, ry = rect.get_xy()\n", + " cx = rx + rect.get_width()/2.0\n", + " cy = ry + rect.get_height()/2.0\n", + " ax.annotate(f\"{int(dss_kbits[i].air):2}\",\n", + " (cx, cy), color='k', weight='normal', fontsize=14, ha='right', \n", + " va='center', path_effects=[pe.withStroke(linewidth=2, foreground='w')])\n", + "\n", + "fig.text(.39, .94, f'Keepbits ', weight='bold', fontsize=16)\n", + "\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "id": "d3b60c66-252d-48a6-af93-a00c9ca8f0ba", + "metadata": { + "tags": [] + }, + "source": [ + "## Summary" + ] + }, + { + "cell_type": "markdown", + "id": "b28089ea-22f9-45c6-abc9-b65bd946ac66", + "metadata": {}, + "source": [ + "Below are the file sizes resulting from the various compression techniques outlined above." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, "id": "998581b5-6ad9-4f6f-9c61-d0bf1486ec7f", "metadata": { "tags": [] @@ -885,11 +1035,29 @@ "source": [ "!du -hs *.nc *.zarr" ] + }, + { + "cell_type": "markdown", + "id": "15c6975d-6909-4e2c-9395-0a64d39ed44f", + "metadata": {}, + "source": [ + "---" + ] + }, + { + "cell_type": "markdown", + "id": "fed34f3f-2bee-45d3-9bdf-1237b77cf1b8", + "metadata": {}, + "source": [ + "In this experiment, the sizes are minimized when applying bitrounding and compression to the dataset chunks. \n", + "\n", + "However, it's important to note that this outcome may not be universally applicable, check this for your dataset." + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, From 70b01af957fe5baf249db797de1ed0e0c3fe953e Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 15 Dec 2023 16:51:04 +0000 Subject: [PATCH 13/16] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/chunking.ipynb | 73 +++++++++++++++++++++++---------------------- 1 file changed, 38 insertions(+), 35 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index 0feb60a1..794b4f2b 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -69,7 +69,7 @@ "}\n", "\n", "# Apply chunking\n", - "ds = ds.chunk(chunks) " + "ds = ds.chunk(chunks)" ] }, { @@ -826,20 +826,15 @@ "%%capture\n", "\n", "# Loop over each chunk\n", - "for b, block in enumerate(ds.air.data.to_delayed().ravel()): \n", - "\n", + "for b, block in enumerate(ds.air.data.to_delayed().ravel()):\n", " # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", - " ds_block = xr.Dataset(\n", - " {\"air\": (dims, block.compute())}\n", - " ) \n", - " \n", + " ds_block = xr.Dataset({\"air\": (dims, block.compute())})\n", + "\n", " # 
Apply bitrounding\n", " rounded_ds = bitrounding(ds_block)\n", - " \n", + "\n", " # Write individual chunk to disk\n", - " rounded_ds.to_zarr(\n", - " fn, region={dims[d]: s for (d, s) in enumerate(slices[b])}\n", - " )" + " rounded_ds.to_zarr(fn, region={dims[d]: s for (d, s) in enumerate(slices[b])})" ] }, { @@ -878,24 +873,21 @@ "dss_kbits = []\n", "\n", "# How many chunks there are\n", - "long_c = int(ds.lon.size / chunks['lon'])\n", - "lat_c = int(ds.lat.size / chunks['lat'])\n", + "long_c = int(ds.lon.size / chunks[\"lon\"])\n", + "lat_c = int(ds.lat.size / chunks[\"lat\"])\n", "\n", "for i, j in product(range(long_c), range(lat_c)):\n", - " \n", " # Extract a chunk of the dataset\n", " temp_ds = ds.isel(\n", - " lon=slice(i * chunks['lon'], (i + 1) * chunks['lon']),\n", - " lat=slice(j * chunks['lat'], (j + 1) * chunks['lat']),\n", + " lon=slice(i * chunks[\"lon\"], (i + 1) * chunks[\"lon\"]),\n", + " lat=slice(j * chunks[\"lat\"], (j + 1) * chunks[\"lat\"]),\n", " )\n", " dss.append(temp_ds)\n", - " \n", + "\n", " # Compress with bitrounding (See details above)\n", - " temp_info_pbit = xb.get_bitinformation(\n", - " temp_ds, dim='lat', implementation=\"python\"\n", - " )\n", + " temp_info_pbit = xb.get_bitinformation(temp_ds, dim=\"lat\", implementation=\"python\")\n", " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", - " temp_keepbits = temp_keepbits.where(temp_keepbits['air'] > 0, 0)\n", + " temp_keepbits = temp_keepbits.where(temp_keepbits[\"air\"] > 0, 0)\n", " dss_kbits.append(temp_keepbits)\n", " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", " dss_bitrounded.append(temp_ds_bitrounded)\n", @@ -963,33 +955,44 @@ "source": [ "# # Create a figure and axis and plot the air temperature\n", "fig, ax = plt.subplots(figsize=(12, 6))\n", - "ds['air'].isel(time=0).plot(ax=ax, cmap='RdBu_r')\n", + "ds[\"air\"].isel(time=0).plot(ax=ax, cmap=\"RdBu_r\")\n", "\n", "for i in range(len(dss_bitrounded)):\n", - " \n", " # Get chunk limits\n", 
" lats = dss[i].lat\n", - " longs = dss[i].lon \n", + " longs = dss[i].lon\n", " x = float(min(longs[0], longs[-1]))\n", " y = float(min(lats[0], lats[-1]))\n", " w = float(abs(longs[0] - longs[-1]))\n", " h = float(abs(lats[0] - lats[-1]))\n", - " \n", + "\n", " # Draw rectangle\n", - " rect = mpl.patches.Rectangle((x, y), width = w, height = h,\n", - " facecolor = \"none\", edgecolor = \"#E5E4E2\",\n", - " path_effects=[pe.withStroke(linewidth=3, foreground=\"gray\")])\n", + " rect = mpl.patches.Rectangle(\n", + " (x, y),\n", + " width=w,\n", + " height=h,\n", + " facecolor=\"none\",\n", + " edgecolor=\"#E5E4E2\",\n", + " path_effects=[pe.withStroke(linewidth=3, foreground=\"gray\")],\n", + " )\n", " ax.add_patch(rect)\n", - " \n", + "\n", " # Annotate number of keepbits\n", " rx, ry = rect.get_xy()\n", - " cx = rx + rect.get_width()/2.0\n", - " cy = ry + rect.get_height()/2.0\n", - " ax.annotate(f\"{int(dss_kbits[i].air):2}\",\n", - " (cx, cy), color='k', weight='normal', fontsize=14, ha='right', \n", - " va='center', path_effects=[pe.withStroke(linewidth=2, foreground='w')])\n", + " cx = rx + rect.get_width() / 2.0\n", + " cy = ry + rect.get_height() / 2.0\n", + " ax.annotate(\n", + " f\"{int(dss_kbits[i].air):2}\",\n", + " (cx, cy),\n", + " color=\"k\",\n", + " weight=\"normal\",\n", + " fontsize=14,\n", + " ha=\"right\",\n", + " va=\"center\",\n", + " path_effects=[pe.withStroke(linewidth=2, foreground=\"w\")],\n", + " )\n", "\n", - "fig.text(.39, .94, f'Keepbits ', weight='bold', fontsize=16)\n", + "fig.text(0.39, 0.94, f\"Keepbits \", weight=\"bold\", fontsize=16)\n", "\n", "plt.show()" ] From c06fcfb8b0cd2bf025d28ae5214fa3987825de1c Mon Sep 17 00:00:00 2001 From: Hauke Schulz <43613877+observingClouds@users.noreply.github.com> Date: Wed, 27 Dec 2023 18:52:01 -0800 Subject: [PATCH 14/16] reordering of cells; add more description --- docs/chunking.ipynb | 236 ++++++++++++++++++++++---------------------- 1 file changed, 120 insertions(+), 116 deletions(-) 
diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index 794b4f2b..56ad538e 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -15,7 +15,9 @@ "source": [ "Geospatial data can vary in its information density from one part of the world to another. A dataset containing streets will be very dense in cities but contains little information in remote places like the Alps or even the ocean. The same is also true for datasets about the ocean or the atmosphere.\n", "\n", - "Currently in the bitinformation framework, to preserve all real information, the maximum information content calculated by `xbitinfo` needs to be used for the entire dataset. However, bitinformation can also be calculated on subsets, such that the ‘boring’ parts can therefore be more efficiently compressed. This notebook portrays how to do it." + "By default the number of bits that need to be kept (`keepbits`) to preserve the requested amount of information is determined based on the entire dataset. This approach doesn't always result in the best compression rates as it preserves too many keepbits in regions with anomalously low information density. The following steps show how the `keepbits` can be retrieved and applied on subsets. In this case, subsets are defined as dataset chunks.\n", + "\n", + "This work is a result of the ECMWF Code4Earth 2023. Please have a look at the [presentation of this project](https://youtu.be/IOi4XvECpsQ?si=hwZkppNRa-J2XVZ9) for additional details." 
] }, { @@ -614,121 +616,6 @@ "ds" ] }, - { - "cell_type": "markdown", - "id": "b9e8fe5a-2e4e-4dfd-8026-0991e9988668", - "metadata": {}, - "source": [ - "## Saving to `NetCDF` file" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "e011a900-5da2-40be-a292-d81a0cafcd6d", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_24883/1840452313.py:2: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", - " ds.to_netcdf(\"0.air_original.nc\")\n" - ] - } - ], - "source": [ - "# Saving the dataset as NetCDF file\n", - "ds.to_netcdf(\"0.air_original.nc\")" - ] - }, - { - "cell_type": "markdown", - "id": "2b98628e-cbcb-4018-8565-4c0324cf2d61", - "metadata": {}, - "source": [ - "## Compress with `to_compressed_netcdf`" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "99d02f35-85fc-4a8d-94a0-880ac2ffbb72", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ayoubf/Projects/xbitinfo/xbitinfo/save_compressed.py:121: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", - " self._obj.to_netcdf(\n" - ] - } - ], - "source": [ - "# Compress and save the dataset as NetCDF file\n", - "ds.to_compressed_netcdf(\"1.air_compressed_all.nc\")" - ] - }, - { - "cell_type": "markdown", - "id": "5f5aae30-4a0a-401c-9018-9e34626c3d2c", - "metadata": {}, - "source": [ - "## Compress with bitrounding" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "fdf077dd-6494-4c38-9461-5ea7ac370a01", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7a346d6cbd14460890ae3be8ec11ff0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 Date: Tue, 6 Feb 
2024 14:04:09 -0800 Subject: [PATCH 15/16] focus on zarr files; fix chunk visualization --- docs/chunking.ipynb | 203 ++++++++++---------------------------------- 1 file changed, 44 insertions(+), 159 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index 56ad538e..1c516c65 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -624,12 +624,6 @@ "## Zarr chunking and compressing" ] }, - { - "cell_type": "markdown", - "id": "e837c725-b7de-4418-a530-113583411884", - "metadata": {}, - "source": [] - }, { "cell_type": "code", "execution_count": 8, @@ -646,7 +640,7 @@ " bitinfo = xb.get_bitinformation(chunk, dim=var, implementation=\"python\")\n", " keepbits = xb.get_keepbits(bitinfo, 0.99)\n", " bitround = xb.xr_bitround(chunk, keepbits)\n", - " return bitround\n", + " return bitround, keepbits\n", "\n", "\n", "def slices_from_chunks(chunks):\n", @@ -674,31 +668,13 @@ ] }, { - "cell_type": "code", - "execution_count": 9, - "id": "19032fcd-93bc-48b8-ba1f-beba9673491b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "fn = \"air.zarr\" # Output filename\n", - "ds.to_compressed_zarr(fn, compute=False, mode=\"w\") # Creates empty file structure" - ] - }, - { - "cell_type": "code", - "execution_count": 10, + "cell_type": "markdown", "id": "7221b47f-b8f4-4ebf-bc2b-cb61d12989be", "metadata": { "tags": [] }, - "outputs": [], "source": [ - "dims = ds.air.dims\n", - "len_dims = len(dims)\n", - "\n", - "slices = slices_from_chunks(ds.air.chunks)" + "### Save dataset as compressed zarr after compressing individual chunks" ] }, { @@ -710,94 +686,28 @@ }, "outputs": [], "source": [ - "%%capture\n", + "fn = \"air_bitrounded_by_chunks.zarr\" # Output filename\n", + "ds.to_compressed_zarr(fn, compute=False, mode=\"w\") # Creates empty file structure\n", + "\n", + "dims = ds.air.dims\n", + "len_dims = len(dims)\n", + "\n", + "slices = slices_from_chunks(ds.air.chunks)\n", "\n", "# Loop over each chunk\n", + "keepbits = []\n", "for b, block in 
enumerate(ds.air.data.to_delayed().ravel()):\n", " # Conversion of dask.delayed array to Dataset (as xbitinfo wants type xr.Dataset)\n", " ds_block = xr.Dataset({\"air\": (dims, block.compute())})\n", "\n", " # Apply bitrounding\n", - " rounded_ds = bitrounding(ds_block)\n", + " rounded_ds, keepbit = bitrounding(ds_block)\n", + " keepbits.append(keepbit)\n", "\n", " # Write individual chunk to disk\n", " rounded_ds.to_zarr(fn, region={dims[d]: s for (d, s) in enumerate(slices[b])})" ] }, - { - "cell_type": "markdown", - "id": "9ae3603f-291d-4c7f-92a8-95d9935daf35", - "metadata": {}, - "source": [ - "## Creating smaller datasets as chunks and compressing" - ] - }, - { - "cell_type": "markdown", - "id": "4265a4fa-b397-4552-ac3c-a1a358fffcd0", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "1d53f86f-fa72-4161-a364-8c1f78dba6d6", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], - "source": [ - "%%capture\n", - "\n", - "# Define a lambda function to ensure that the value is at least zero\n", - "# negative keepbits not yet supported\n", - "at_least_zero = lambda x: max(x, 0)\n", - "\n", - "# Create empty intermediate holders for plotting later\n", - "dss = []\n", - "dss_bitrounded = []\n", - "dss_kbits = []\n", - "\n", - "# How many chunks there are\n", - "long_c = int(ds.lon.size / chunks[\"lon\"])\n", - "lat_c = int(ds.lat.size / chunks[\"lat\"])\n", - "\n", - "for i, j in product(range(long_c), range(lat_c)):\n", - " # Extract a chunk of the dataset\n", - " temp_ds = ds.isel(\n", - " lon=slice(i * chunks[\"lon\"], (i + 1) * chunks[\"lon\"]),\n", - " lat=slice(j * chunks[\"lat\"], (j + 1) * chunks[\"lat\"]),\n", - " )\n", - " dss.append(temp_ds)\n", - "\n", - " # Compress with bitrounding (See details above)\n", - " temp_info_pbit = xb.get_bitinformation(temp_ds, dim=\"lat\", implementation=\"python\")\n", - " temp_keepbits = xb.get_keepbits(temp_info_pbit, 0.99)\n", - " temp_keepbits = 
temp_keepbits.where(temp_keepbits[\"air\"] > 0, 0)\n", - " dss_kbits.append(temp_keepbits)\n", - " temp_ds_bitrounded = xb.xr_bitround(temp_ds, temp_keepbits)\n", - " dss_bitrounded.append(temp_ds_bitrounded)\n", - "\n", - " # Merge the bitrounded datasets\n", - " if i == 0 and j == 0:\n", - " MERGED_ds_bitr = temp_ds_bitrounded\n", - " else:\n", - " MERGED_ds_bitr = xr.merge([MERGED_ds_bitr, temp_ds_bitrounded])" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "7e1c4b35-4767-47e7-9ddf-357250b631b8", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "MERGED_ds_bitr.to_compressed_netcdf(\"3.air_chunked_bitr_compressed.nc\")" - ] - }, { "cell_type": "markdown", "id": "5d628121-d5ec-4544-a47f-f47c86524b09", @@ -842,12 +752,17 @@ "source": [ "# # Create a figure and axis and plot the air temperature\n", "fig, ax = plt.subplots(figsize=(12, 6))\n", - "ds[\"air\"].isel(time=0).plot(ax=ax, cmap=\"RdBu_r\")\n", + "rounded_ds = xr.open_zarr(fn).isel(time=0)\n", + "rounded_ds[\"air\"].plot(ax=ax, cmap=\"RdBu_r\")\n", + "\n", + "slices = slices_from_chunks(rounded_ds.air.chunks)\n", "\n", - "for i in range(len(dss_bitrounded)):\n", + "for i in range(len(slices)):\n", " # Get chunk limits\n", - " lats = dss[i].lat\n", - " longs = dss[i].lon\n", + " dss = rounded_ds.isel(lat=slices[i][0], lon=slices[i][1])\n", + " lats = dss.lat\n", + " longs = dss.lon\n", + "\n", " x = float(min(longs[0], longs[-1]))\n", " y = float(min(lats[0], lats[-1]))\n", " w = float(abs(longs[0] - longs[-1]))\n", @@ -869,7 +784,7 @@ " cx = rx + rect.get_width() / 2.0\n", " cy = ry + rect.get_height() / 2.0\n", " ax.annotate(\n", - " f\"{int(dss_kbits[i].air):2}\",\n", + " f\"{int(keepbits[i].air):2}\",\n", " (cx, cy),\n", " color=\"k\",\n", " weight=\"normal\",\n", @@ -892,8 +807,8 @@ "## Reference compression\n", "For comparision with other compression approaches the dataset is also saved as:\n", "- uncompressed netCDF\n", - "- lossless compressed netCDF\n", - "- 
lossy compressed netCDF while preserving 99% of bitinformation" + "- lossless compressed zarr\n", + "- lossy compressed zarr while preserving 99% of bitinformation" ] }, { @@ -906,7 +821,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "e011a900-5da2-40be-a292-d81a0cafcd6d", "metadata": { "tags": [] @@ -928,77 +843,47 @@ }, { "cell_type": "markdown", - "id": "2b98628e-cbcb-4018-8565-4c0324cf2d61", + "id": "1cc93427", "metadata": {}, "source": [ - "### Saving as compressed NetCDF with `to_compressed_netcdf`" + "### Save dataset as compressed zarr (without bitrounding)" ] }, { "cell_type": "code", - "execution_count": 5, - "id": "99d02f35-85fc-4a8d-94a0-880ac2ffbb72", + "execution_count": 9, + "id": "19032fcd-93bc-48b8-ba1f-beba9673491b", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/ayoubf/Projects/xbitinfo/xbitinfo/save_compressed.py:121: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", - " self._obj.to_netcdf(\n" - ] - } - ], + "outputs": [], "source": [ - "# Compress and save the dataset as NetCDF file\n", - "ds.to_compressed_netcdf(\"1.air_compressed_all.nc\")" + "fn = \"air_compressed.zarr\" # Output filename\n", + "ds.to_compressed_zarr(fn, mode=\"w\") # Write the compressed dataset (no bitrounding)" ] }, { "cell_type": "markdown", - "id": "5f5aae30-4a0a-401c-9018-9e34626c3d2c", + "id": "648f759c", "metadata": {}, "source": [ - "### Saving while preserving 99% of information based on bitrounding algorithm" + "### Save dataset as compressed zarr after applying bitrounding" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "fdf077dd-6494-4c38-9461-5ea7ac370a01", + "execution_count": null, + "id": "93eb4cd6", "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d7a346d6cbd14460890ae3be8ec11ff0", - 
"version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/1 [00:00 Date: Tue, 6 Feb 2024 14:23:38 -0800 Subject: [PATCH 16/16] suppress progress bar; remove output of cells --- docs/chunking.ipynb | 590 ++------------------------------------------ 1 file changed, 15 insertions(+), 575 deletions(-) diff --git a/docs/chunking.ipynb b/docs/chunking.ipynb index 1c516c65..cf58b158 100644 --- a/docs/chunking.ipynb +++ b/docs/chunking.ipynb @@ -30,7 +30,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "96e9149e-fc6d-4048-8e45-a29966e5c6b8", "metadata": { "tags": [] @@ -54,7 +54,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "320224c9-06e2-428a-8614-8ed0d15eee82", "metadata": { "tags": [] @@ -76,542 +76,12 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "id": "f3120040-79f1-4a7f-a61f-afec9fb3ca5b", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "\n", - "
<xarray.Dataset>\n",
-       "Dimensions:  (lat: 25, time: 2920, lon: 53)\n",
-       "Coordinates:\n",
-       "  * lat      (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n",
-       "  * lon      (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n",
-       "  * time     (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n",
-       "Data variables:\n",
-       "    air      (time, lat, lon) float32 dask.array<chunksize=(2920, 5, 10), meta=np.ndarray>\n",
-       "Attributes:\n",
-       "    Conventions:  COARDS\n",
-       "    title:        4x daily NMC reanalysis (1948)\n",
-       "    description:  Data is from NMC initialized reanalysis\\n(4x/day).  These a...\n",
-       "    platform:     Model\n",
-       "    references:   http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly...
" - ], - "text/plain": [ - "\n", - "Dimensions: (lat: 25, time: 2920, lon: 53)\n", - "Coordinates:\n", - " * lat (lat) float32 75.0 72.5 70.0 67.5 65.0 ... 25.0 22.5 20.0 17.5 15.0\n", - " * lon (lon) float32 200.0 202.5 205.0 207.5 ... 322.5 325.0 327.5 330.0\n", - " * time (time) datetime64[ns] 2013-01-01 ... 2014-12-31T18:00:00\n", - "Data variables:\n", - " air (time, lat, lon) float32 dask.array\n", - "Attributes:\n", - " Conventions: COARDS\n", - " title: 4x daily NMC reanalysis (1948)\n", - " description: Data is from NMC initialized reanalysis\\n(4x/day). These a...\n", - " platform: Model\n", - " references: http://www.esrl.noaa.gov/psd/data/gridded/data.ncep.reanaly..." - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "ds" ] @@ -626,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "91343d2a-63ec-4d61-a369-cc99139297e4", "metadata": { "tags": [] @@ -679,13 +149,14 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "7c2ed18f-4dc8-4f5c-88ed-ae5ad41d1647", "metadata": { "tags": [] }, "outputs": [], "source": [ + "%%capture\n", "fn = \"air_bitrounded_by_chunks.zarr\" # Output filename\n", "ds.to_compressed_zarr(fn, compute=False, mode=\"w\") # Creates empty file structure\n", "\n", @@ -718,7 +189,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "id": "d8835a3c-8af0-4423-baf4-84aa9a386f67", "metadata": { "tags": [] @@ -732,23 +203,12 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": null, "id": "c3b5f657-cddd-4476-82a3-c3c2c1a6e7b6", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], + "outputs": [], "source": [ "# # Create a figure and axis and plot the air temperature\n", "fig, ax = plt.subplots(figsize=(12, 6))\n", @@ -826,16 +286,7 @@ "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/tmp/ipykernel_24883/1840452313.py:2: SerializationWarning: saving variable air with floating point data as an integer dtype without any _FillValue to use for NaNs\n", - " ds.to_netcdf(\"0.air_original.nc\")\n" - ] - } - ], + "outputs": [], "source": [ "# Saving the dataset as NetCDF file\n", "ds.to_netcdf(\"0.air_original.nc\")" @@ -851,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "id": "19032fcd-93bc-48b8-ba1f-beba9673491b", "metadata": { "tags": [] @@ -881,6 +332,7 @@ }, "outputs": [], "source": [ + "%%capture\n", "fn = \"air_bitrounded.zarr\" # Output filename\n", "rounded_ds, keepbits = bitrounding(ds)\n", "rounded_ds.to_compressed_zarr(fn, mode=\"w\")" @@ -906,24 +358,12 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": null, "id": "998581b5-6ad9-4f6f-9c61-d0bf1486ec7f", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "7.5M\t0.air_original.nc\n", - "1.7M\t1.air_compressed_all.nc\n", - "1.3M\t2.air_bitrounded_compressed.nc\n", - "776K\t3.air_chunked_bitr_compressed.nc\n", - "1.1M\tair.zarr\n" - ] - } - ], + "outputs": [], "source": [ "!du -hs *.nc *.zarr" ]