Skip to content

Commit

Permalink
Add summary and interpretations (#28)
Browse files Browse the repository at this point in the history
* Add plotting utilities

* Add summary and interpretations

* Update workflow

* Improve summary

* Update quarto-publish.yaml
  • Loading branch information
maxrjones authored Nov 1, 2024
1 parent 91d4a34 commit 5fa6684
Show file tree
Hide file tree
Showing 4 changed files with 1,264 additions and 3 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ repos:
- id: mixed-line-ending

- repo: https://github.com/asottile/pyupgrade
rev: v3.18.0
rev: v3.19.0
hooks:
- id: pyupgrade
args:
Expand All @@ -28,7 +28,7 @@ repos:
- id: blackdoc

- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: "v0.6.9"
rev: "v0.7.2"
hooks:
- id: ruff
args: ["--fix"]
Expand Down
9 changes: 8 additions & 1 deletion _quarto.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ website:
text: NetCDF Driver (Local storage)
- href: examples/resample-netcdf-rasterio-netcdf-vsis3.ipynb
text: NetCDF Driver (S3 storage)
- href: examples/resample-weboptimizedzarr-rasterio-zarr-icechunk.ipynb
text: Web-Optimized Zarr (S3 storage)
- href: examples/resample-cog-rasterio-cog-.ipynb
text: Cloud-Optimized GeoTIFF (S3 storage)
- section: Rioxarray
contents:
- href: examples/resample-netcdf-rioxarray-h5netcdf-local.ipynb
Expand All @@ -52,6 +56,8 @@ website:
text: Zarr Reader + Icechunk virtualization (S3 storage)
- href: examples/resample-zarr-rioxarray-zarr-icechunk.ipynb
text: Zarr Reader (S3 storage)
- href: examples/resample-weboptimizedzarr-rioxarray-zarr-icechunk.ipynb
text: Web-Optimized Zarr (S3 storage)
- section: Open Data Cube
contents:
- href: examples/resample-netcdf-odc-h5netcdf-local.ipynb
Expand Down Expand Up @@ -108,7 +114,8 @@ website:
text: Memory and time usage (MURSST)
- href: examples/process-gpm-results.ipynb
text: Memory and time usage (GPM IMERG)

- href: examples/summarize-results.ipynb
text: Summary and interpretation
format:
html:
theme:
Expand Down
195 changes: 195 additions & 0 deletions examples/plotting.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
# Load results
import hvplot.pandas # noqa
from utils import process_results
from typing import Literal
import pandas as pd

# Silence pandas' chained-assignment (SettingWithCopy) warning: several
# plotting helpers in this file add or overwrite columns on filtered
# selections of ``df``.
pd.options.mode.chained_assignment = None

# Results table shared by every function in this file; it is loaded once,
# when the module is imported.
# NOTE(review): the schema comes from ``utils.process_results``. The helpers
# in this file read the columns "dataset", "format", "virtual", "method",
# "driver", "zoom", "duration (s)" and "peak memory (GB)" -- confirm against
# that helper.
df = process_results("results")


def subset_dataset(
    dataset: Literal["mursst", "gpm_imerg"],
    local: bool = True,
    format: Literal["netcdf", "zarr"] = "netcdf",
):
    """Return a copy of the rows of ``df`` for one dataset and storage format.

    Parameters
    ----------
    dataset
        Value to match in the "dataset" column.
    local
        If true, keep only rows whose "virtual" column equals ``"local"``;
        otherwise keep only rows where it does not.
    format
        Value to match in the "format" column.

    Returns
    -------
    An independent copy of the matching rows of the module-level ``df``.
    """
    # Pick the location test first, then apply all three criteria in one pass.
    if local:
        location_mask = df["virtual"] == "local"
    else:
        location_mask = df["virtual"] != "local"
    keep = (df["format"] == format) & (df["dataset"] == dataset) & location_mask
    return df[keep].copy()


def plot_time(
    dataset: Literal["mursst", "gpm_imerg"],
    local: bool,
    format: Literal["netcdf", "zarr"] = "netcdf",
):
    """Bar chart of the mean resampling duration per zoom level and method.

    Parameters
    ----------
    dataset
        Dataset to plot; ``"mursst"`` is titled "MUR SST", anything else
        "GPM IMERG".
    local
        Forwarded to ``subset_dataset`` to choose local or non-local rows;
        also selects the location text in the title.
    format
        Storage format forwarded to ``subset_dataset``.

    Returns
    -------
    The object produced by ``hvplot.bar`` for the averaged durations.
    """
    rows = subset_dataset(dataset, local=local, format=format)
    mean_duration = rows.groupby(["zoom", "method"])["duration (s)"].mean(
        numeric_only=True
    )

    dataset_title = "MUR SST" if dataset == "mursst" else "GPM IMERG"
    location = "(local file)" if local else "(remote file on s3)"

    bar_options = {
        "title": f"Duration for resampling {dataset_title} {location} (s)",
        "xlabel": "Zoom level, Resampling library",
        "ylabel": "Duration (s)",
        "color": "teal",
        "rot": 90,
        "width": 1000,
    }
    return mean_duration.hvplot.bar(**bar_options)


def plot_memory(
    dataset: Literal["mursst", "gpm_imerg"],
    local: bool,
    format: Literal["netcdf", "zarr"] = "netcdf",
):
    """Bar chart of the mean peak memory per zoom level and resampling method.

    Parameters
    ----------
    dataset
        Dataset to plot; ``"mursst"`` is titled "MUR SST", anything else
        "GPM IMERG".
    local
        Forwarded to ``subset_dataset`` to choose local or non-local rows;
        also selects the location text in the title.
    format
        Storage format forwarded to ``subset_dataset``.

    Returns
    -------
    The object produced by ``hvplot.bar`` for the averaged peak memory.
    """
    subset = subset_dataset(dataset, local, format)
    subset = subset.groupby(["zoom", "method"])["peak memory (GB)"].mean(
        numeric_only=True
    )
    if dataset == "mursst":
        dataset_title = "MUR SST"
    else:
        dataset_title = "GPM IMERG"
    if local:
        location = "(local file)"
    else:
        location = "(remote file on s3)"
    title = f"Peak memory allocation for resampling {dataset_title} {location}"
    plt = subset.hvplot.bar(
        width=1000,
        rot=90,
        color="teal",
        title=title,
        # Fixed: the label previously began with a stray apostrophe.
        ylabel="Peak memory (GB)",
        xlabel="Zoom level, Resampling library",
    )
    return plt


def plot_time_by_format(dataset: Literal["mursst", "gpm_imerg"], method: str = "odc"):
    """Bar chart of the mean resampling duration per zoom level and format.

    Only rows for ``dataset`` and ``method`` whose "virtual" column is not
    ``"local"`` are used. Rows whose "virtual" column is ``"icechunk"`` get
    " (via icechunk)" appended to their format so they form separate bars.

    Parameters
    ----------
    dataset
        Dataset to plot; ``"mursst"`` is titled "MUR SST", anything else
        "GPM IMERG".
    method
        Value to match in the "method" column.

    Returns
    -------
    The object produced by ``hvplot.bar`` for the averaged durations.
    """

    def _format_label(row):
        # Distinguish data read through icechunk from direct reads.
        if row["virtual"] == "icechunk":
            return f"{row['format']} (via icechunk)"
        return row["format"]

    rows = df[df["dataset"] == dataset]
    rows = rows[rows["virtual"] != "local"]
    rows = rows[rows["method"] == method]
    rows["format"] = rows.apply(_format_label, axis=1)

    mean_duration = rows.groupby(["zoom", "format"])["duration (s)"].mean(
        numeric_only=True
    )
    dataset_title = "MUR SST" if dataset == "mursst" else "GPM IMERG"
    bar_options = {
        "title": f"Duration for resampling {dataset_title}",
        "xlabel": "Zoom level, Storage format",
        "ylabel": "Duration (s)",
        "color": "teal",
        "rot": 90,
        "width": 1000,
    }
    return mean_duration.hvplot.bar(**bar_options)


def plot_memory_by_format(dataset: Literal["mursst", "gpm_imerg"], method: str = "odc"):
    """Bar chart of the mean peak memory per zoom level and storage format.

    Only rows for ``dataset`` and ``method`` whose "virtual" column is not
    ``"local"`` are used. Rows whose "virtual" column is ``"icechunk"`` get
    " (via icechunk)" appended to their format so they form separate bars.

    Parameters
    ----------
    dataset
        Dataset to plot; ``"mursst"`` is titled "MUR SST", anything else
        "GPM IMERG".
    method
        Value to match in the "method" column.

    Returns
    -------
    The object produced by ``hvplot.bar`` for the averaged peak memory.
    """
    subset = df[df["dataset"] == dataset]
    subset = subset[subset["virtual"] != "local"]
    # Take an explicit copy so the column assignment below never targets a
    # selection of the shared module-level frame.
    subset = subset[subset["method"] == method].copy()
    subset["format"] = subset.apply(
        lambda x: (
            f"{x['format']} (via icechunk)"
            if x["virtual"] == "icechunk"
            else x["format"]
        ),
        axis=1,
    )
    subset = subset.groupby(["zoom", "format"])["peak memory (GB)"].mean(
        numeric_only=True
    )
    if dataset == "mursst":
        dataset_title = "MUR SST"
    else:
        dataset_title = "GPM IMERG"
    title = f"Peak memory allocation for resampling {dataset_title}"
    plt = subset.hvplot.bar(
        width=1000,
        rot=90,
        color="teal",
        title=title,
        # Fixed: the label previously began with a stray apostrophe.
        ylabel="Peak memory (GB)",
        # Fixed: bars are grouped by storage format, not by resampling
        # library; this now matches plot_time_by_format.
        xlabel="Zoom level, Storage format",
    )
    return plt


def plot_duration_by_weboptimization():
    """Bar chart of mean MUR SST resampling duration per format and method.

    Uses the rows of ``df`` where the dataset is ``"mursst"``, the method is
    ``"rasterio"`` or ``"rioxarray"``, the driver is not ``"h5netcdf"``, the
    "virtual" column is not ``"local"`` and the format is not ``"netcdf"``.
    Formats are given display names and combined with the method into one
    label per bar.

    Returns
    -------
    The object produced by ``hvplot.bar`` for the averaged durations.
    """
    display_names = {
        "cog": "COG",
        "weboptimizedzarr": "Web-Optimized Zarr",
        "zarr": "Zarr",
    }

    def _bar_label(row):
        return f"{row['format']} (resampled with {row['method']})"

    is_mursst = df["dataset"] == "mursst"
    uses_rio = df["method"].isin(["rasterio", "rioxarray"])
    not_h5netcdf = df["driver"] != "h5netcdf"
    not_local = df["virtual"] != "local"
    not_netcdf = df["format"] != "netcdf"
    rows = df[is_mursst & uses_rio & not_h5netcdf & not_local & not_netcdf]

    rows["format"] = rows["format"].replace(display_names)
    rows["ID"] = rows.apply(_bar_label, axis=1)
    mean_duration = rows.groupby(["zoom", "ID"])["duration (s)"].mean(
        numeric_only=True
    )

    bar_options = {
        "title": "Duration for resampling MUR SST",
        "xlabel": "Zoom level, Format (resampling method)",
        "ylabel": "Duration (s)",
        "color": "teal",
        "rot": 90,
        "width": 1000,
        "height": 500,
    }
    return mean_duration.hvplot.bar(**bar_options)


def plot_memory_by_weboptimization():
    """Bar chart of mean MUR SST peak memory per format and resampling method.

    Uses the rows of ``df`` where the dataset is ``"mursst"``, the method is
    ``"rasterio"`` or ``"rioxarray"``, the driver is not ``"h5netcdf"``, the
    "virtual" column is not ``"local"`` and the format is not ``"netcdf"``.
    Formats are given display names and combined with the method into one
    label per bar.

    Returns
    -------
    The object produced by ``hvplot.bar`` for the averaged peak memory.
    """
    # Take an explicit copy so the column assignments below never target a
    # selection of the shared module-level frame.
    subset = df[
        (df["dataset"] == "mursst")
        & df["method"].isin(["rasterio", "rioxarray"])
        & (df["driver"] != "h5netcdf")
        & (df["virtual"] != "local")
        & (df["format"] != "netcdf")
    ].copy()
    subset["format"] = subset["format"].replace(
        {"cog": "COG", "weboptimizedzarr": "Web-Optimized Zarr", "zarr": "Zarr"}
    )
    subset["ID"] = subset.apply(
        lambda x: f"{x['format']} (resampled with {x['method']})", axis=1
    )
    subset = subset.groupby(["zoom", "ID"])["peak memory (GB)"].mean(numeric_only=True)
    title = "Peak memory allocation for resampling MUR SST"
    plt = subset.hvplot.bar(
        width=1000,
        height=500,
        rot=90,
        color="teal",
        title=title,
        # Fixed: the label previously began with a stray apostrophe.
        ylabel="Peak memory (GB)",
        xlabel="Zoom level, Format (resampling method)",
    )
    return plt
Loading

0 comments on commit 5fa6684

Please sign in to comment.