Add summary and interpretations (#28)

* Add plotting utilities * Add summary and interpretations * Update workflow * Improve summary * Update quarto-publish.yaml
developmentseed · Nov 1, 2024 · 5fa6684 · 5fa6684
1 parent 91d4a34
commit 5fa6684
Show file tree

Hide file tree

Showing 4 changed files with 1,264 additions and 3 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -11,7 +11,7 @@ repos:
       - id: mixed-line-ending
 
   - repo: https://github.com/asottile/pyupgrade
-    rev: v3.18.0
+    rev: v3.19.0
     hooks:
       - id: pyupgrade
         args:
@@ -28,7 +28,7 @@ repos:
       - id: blackdoc
 
   - repo: https://github.com/charliermarsh/ruff-pre-commit
-    rev: "v0.6.9"
+    rev: "v0.7.2"
     hooks:
       - id: ruff
         args: ["--fix"]

diff --git a/_quarto.yml b/_quarto.yml
@@ -42,6 +42,10 @@ website:
                 text: NetCDF Driver (Local storage)
               - href: examples/resample-netcdf-rasterio-netcdf-vsis3.ipynb
                 text: NetCDF Driver (S3 storage)
+              - href: examples/resample-weboptimizedzarr-rasterio-zarr-icechunk.ipynb
+                text: Web-Optimized Zarr (S3 storage)
+              - href: examples/resample-cog-rasterio-cog-.ipynb
+                text: Cloud-Optimized GeoTIFF (S3 storage)
           - section: Rioxarray
             contents:
               - href: examples/resample-netcdf-rioxarray-h5netcdf-local.ipynb
@@ -52,6 +56,8 @@ website:
                 text: Zarr Reader + Icechunk virtualization (S3 storage)
               - href: examples/resample-zarr-rioxarray-zarr-icechunk.ipynb
                 text: Zarr Reader (S3 storage)
+              - href: examples/resample-weboptimizedzarr-rioxarray-zarr-icechunk.ipynb
+                text: Web-Optimized Zarr (S3 storage)
           - section: Open Data Cube
             contents:
               - href: examples/resample-netcdf-odc-h5netcdf-local.ipynb
@@ -108,7 +114,8 @@ website:
             text: Memory and time usage (MURSST)
           - href: examples/process-gpm-results.ipynb
             text: Memory and time usage (GPM IMERG)
-
+      - href: examples/summarize-results.ipynb
+        text: Summary and interpretation
 format:
   html:
     theme:

diff --git a/examples/plotting.py b/examples/plotting.py
@@ -0,0 +1,195 @@
+# Load results
+import hvplot.pandas  # noqa
+from utils import process_results
+from typing import Literal
+import pandas as pd
+
+# Disable pandas' chained-assignment (SettingWithCopy) warning for the whole
+# session.
+pd.options.mode.chained_assignment = None
+
+# Results table shared by every helper defined in this module.
+# NOTE(review): presumably `process_results` reads the "results" directory and
+# returns a DataFrame -- confirm against utils.process_results.
+df = process_results("results")
+
+
+def subset_dataset(
+    dataset: Literal["mursst", "gpm_imerg"],
+    local: bool = True,
+    format: Literal["netcdf", "zarr"] = "netcdf",
+):
+    """Return a copy of the rows of ``df`` for one dataset, format and location.
+
+    Args:
+        dataset: Value to match in the ``"dataset"`` column.
+        local: When truthy keep rows whose ``"virtual"`` column equals
+            ``"local"``; otherwise keep the rows where it differs.
+        format: Value to match in the ``"format"`` column.
+
+    Returns:
+        An independent copy of the matching rows.
+    """
+    if local:
+        location_mask = df["virtual"] == "local"
+    else:
+        location_mask = df["virtual"] != "local"
+    selected = (df["format"] == format) & (df["dataset"] == dataset) & location_mask
+    return df[selected].copy()
+
+
+def plot_time(
+    dataset: Literal["mursst", "gpm_imerg"],
+    local: bool,
+    format: Literal["netcdf", "zarr"] = "netcdf",
+):
+    """Bar chart of the mean ``"duration (s)"`` per zoom level and method.
+
+    Args:
+        dataset: Dataset to plot; ``"mursst"`` is titled "MUR SST", any other
+            value is titled "GPM IMERG".
+        local: Forwarded to ``subset_dataset`` and used to pick the location
+            text shown in the title.
+        format: Forwarded to ``subset_dataset``.
+
+    Returns:
+        The object returned by ``Series.hvplot.bar``.
+    """
+    dataset_title = "MUR SST" if dataset == "mursst" else "GPM IMERG"
+    location = "(local file)" if local else "(remote file on s3)"
+    mean_duration = (
+        subset_dataset(dataset, local, format)
+        .groupby(["zoom", "method"])["duration (s)"]
+        .mean(numeric_only=True)
+    )
+    return mean_duration.hvplot.bar(
+        width=1000,
+        rot=90,
+        color="teal",
+        title=f"Duration for resampling {dataset_title} {location} (s)",
+        ylabel="Duration (s)",
+        xlabel="Zoom level, Resampling library",
+    )
+
+
+def plot_memory(
+    dataset: Literal["mursst", "gpm_imerg"],
+    local: bool,
+    format: Literal["netcdf", "zarr"] = "netcdf",
+):
+    """Bar chart of the mean ``"peak memory (GB)"`` per zoom level and method.
+
+    Args:
+        dataset: Dataset to plot; ``"mursst"`` is titled "MUR SST", any other
+            value is titled "GPM IMERG".
+        local: Forwarded to ``subset_dataset`` and used to pick the location
+            text shown in the title.
+        format: Forwarded to ``subset_dataset``.
+
+    Returns:
+        The object returned by ``Series.hvplot.bar``.
+    """
+    subset = subset_dataset(dataset, local, format)
+    mean_memory = subset.groupby(["zoom", "method"])["peak memory (GB)"].mean(
+        numeric_only=True
+    )
+    dataset_title = "MUR SST" if dataset == "mursst" else "GPM IMERG"
+    location = "(local file)" if local else "(remote file on s3)"
+    title = f"Peak memory allocation for resampling {dataset_title} {location}"
+    return mean_memory.hvplot.bar(
+        width=1000,
+        rot=90,
+        color="teal",
+        title=title,
+        # The label previously carried a stray leading apostrophe.
+        ylabel="Peak memory (GB)",
+        xlabel="Zoom level, Resampling library",
+    )
+
+
+def plot_time_by_format(dataset: Literal["mursst", "gpm_imerg"], method: str = "odc"):
+    """Bar chart of the mean ``"duration (s)"`` per zoom level and format.
+
+    Only rows whose ``"virtual"`` column is not ``"local"`` are used. Rows whose
+    ``"virtual"`` column is ``"icechunk"`` get " (via icechunk)" appended to
+    their format label so they form their own bars.
+
+    Args:
+        dataset: Value to match in the ``"dataset"`` column; ``"mursst"`` is
+            titled "MUR SST", any other value is titled "GPM IMERG".
+        method: Value to match in the ``"method"`` column.
+
+    Returns:
+        The object returned by ``Series.hvplot.bar``.
+    """
+    # Work on an explicit copy so the column assignment below does not depend
+    # on pandas' chained-assignment warning being disabled.
+    subset = df[
+        (df["dataset"] == dataset)
+        & (df["virtual"] != "local")
+        & (df["method"] == method)
+    ].copy()
+    # Vectorized relabelling: unlike a row-wise ``apply``, this still yields a
+    # Series when the selection is empty.
+    formats = subset["format"].astype(object)
+    subset["format"] = formats.mask(
+        subset["virtual"] == "icechunk",
+        formats.astype(str) + " (via icechunk)",
+    )
+    mean_duration = subset.groupby(["zoom", "format"])["duration (s)"].mean(
+        numeric_only=True
+    )
+    dataset_title = "MUR SST" if dataset == "mursst" else "GPM IMERG"
+    return mean_duration.hvplot.bar(
+        width=1000,
+        rot=90,
+        color="teal",
+        title=f"Duration for resampling {dataset_title}",
+        ylabel="Duration (s)",
+        xlabel="Zoom level, Storage format",
+    )
+
+
+def plot_memory_by_format(dataset: Literal["mursst", "gpm_imerg"], method: str = "odc"):
+    """Bar chart of the mean ``"peak memory (GB)"`` per zoom level and format.
+
+    Only rows whose ``"virtual"`` column is not ``"local"`` are used. Rows whose
+    ``"virtual"`` column is ``"icechunk"`` get " (via icechunk)" appended to
+    their format label so they form their own bars.
+
+    Args:
+        dataset: Value to match in the ``"dataset"`` column; ``"mursst"`` is
+            titled "MUR SST", any other value is titled "GPM IMERG".
+        method: Value to match in the ``"method"`` column.
+
+    Returns:
+        The object returned by ``Series.hvplot.bar``.
+    """
+    # Work on an explicit copy so the column assignment below does not depend
+    # on pandas' chained-assignment warning being disabled.
+    subset = df[
+        (df["dataset"] == dataset)
+        & (df["virtual"] != "local")
+        & (df["method"] == method)
+    ].copy()
+    # Vectorized relabelling: unlike a row-wise ``apply``, this still yields a
+    # Series when the selection is empty.
+    formats = subset["format"].astype(object)
+    subset["format"] = formats.mask(
+        subset["virtual"] == "icechunk",
+        formats.astype(str) + " (via icechunk)",
+    )
+    mean_memory = subset.groupby(["zoom", "format"])["peak memory (GB)"].mean(
+        numeric_only=True
+    )
+    dataset_title = "MUR SST" if dataset == "mursst" else "GPM IMERG"
+    return mean_memory.hvplot.bar(
+        width=1000,
+        rot=90,
+        color="teal",
+        title=f"Peak memory allocation for resampling {dataset_title}",
+        # The label previously carried a stray leading apostrophe.
+        ylabel="Peak memory (GB)",
+        # Bars are grouped by storage format, not by resampling library.
+        xlabel="Zoom level, Storage format",
+    )
+
+
+def plot_duration_by_weboptimization():
+    """Bar chart of the mean ``"duration (s)"`` for MUR SST per zoom and format.
+
+    Uses the ``"mursst"`` rows resampled with ``"rasterio"`` or ``"rioxarray"``
+    whose driver is not ``"h5netcdf"``, whose ``"virtual"`` value is not
+    ``"local"`` and whose format is not ``"netcdf"``. Bars are keyed by zoom
+    level and a "<format> (resampled with <method>)" label.
+
+    Returns:
+        The object returned by ``Series.hvplot.bar``.
+    """
+    # Work on an explicit copy so the column assignments below do not depend
+    # on pandas' chained-assignment warning being disabled.
+    subset = df[
+        (df["dataset"] == "mursst")
+        & df["method"].isin(["rasterio", "rioxarray"])
+        & (df["driver"] != "h5netcdf")
+        & (df["virtual"] != "local")
+        & (df["format"] != "netcdf")
+    ].copy()
+    subset["format"] = subset["format"].replace(
+        {"cog": "COG", "weboptimizedzarr": "Web-Optimized Zarr", "zarr": "Zarr"}
+    )
+    # Vectorized label building: unlike a row-wise ``apply``, this still yields
+    # a Series when the selection is empty.
+    subset["ID"] = (
+        subset["format"].astype(str)
+        + " (resampled with "
+        + subset["method"].astype(str)
+        + ")"
+    )
+    mean_duration = subset.groupby(["zoom", "ID"])["duration (s)"].mean(
+        numeric_only=True
+    )
+    return mean_duration.hvplot.bar(
+        width=1000,
+        height=500,
+        rot=90,
+        color="teal",
+        title="Duration for resampling MUR SST",
+        ylabel="Duration (s)",
+        xlabel="Zoom level, Format (resampling method)",
+    )
+
+
+def plot_memory_by_weboptimization():
+    """Bar chart of the mean ``"peak memory (GB)"`` for MUR SST per zoom and format.
+
+    Uses the ``"mursst"`` rows resampled with ``"rasterio"`` or ``"rioxarray"``
+    whose driver is not ``"h5netcdf"``, whose ``"virtual"`` value is not
+    ``"local"`` and whose format is not ``"netcdf"``. Bars are keyed by zoom
+    level and a "<format> (resampled with <method>)" label.
+
+    Returns:
+        The object returned by ``Series.hvplot.bar``.
+    """
+    # Work on an explicit copy so the column assignments below do not depend
+    # on pandas' chained-assignment warning being disabled.
+    subset = df[
+        (df["dataset"] == "mursst")
+        & df["method"].isin(["rasterio", "rioxarray"])
+        & (df["driver"] != "h5netcdf")
+        & (df["virtual"] != "local")
+        & (df["format"] != "netcdf")
+    ].copy()
+    subset["format"] = subset["format"].replace(
+        {"cog": "COG", "weboptimizedzarr": "Web-Optimized Zarr", "zarr": "Zarr"}
+    )
+    # Vectorized label building: unlike a row-wise ``apply``, this still yields
+    # a Series when the selection is empty.
+    subset["ID"] = (
+        subset["format"].astype(str)
+        + " (resampled with "
+        + subset["method"].astype(str)
+        + ")"
+    )
+    mean_memory = subset.groupby(["zoom", "ID"])["peak memory (GB)"].mean(
+        numeric_only=True
+    )
+    return mean_memory.hvplot.bar(
+        width=1000,
+        height=500,
+        rot=90,
+        color="teal",
+        title="Peak memory allocation for resampling MUR SST",
+        # The label previously carried a stray leading apostrophe.
+        ylabel="Peak memory (GB)",
+        xlabel="Zoom level, Format (resampling method)",
+    )