diff --git a/mikeio1d/pandas_extension/__init__.py b/mikeio1d/pandas_extension/__init__.py
new file mode 100644
index 00000000..36dcff8e
--- /dev/null
+++ b/mikeio1d/pandas_extension/__init__.py
@@ -0,0 +1,13 @@
+from .mikeio1d_accessor import Mikeio1dAccessor  # noqa
+from .transposed_groupby import TransposedGroupBy  # noqa
+from .result_reaches_helpers import agg_chainage  # noqa
+from .result_reaches_helpers import groupby_chainage  # noqa
+from .various import compact_dataframe
+
+__all__ = [
+    "Mikeio1dAccessor",
+    "TransposedGroupBy",
+    "agg_chainage",
+    "groupby_chainage",
+    "compact_dataframe",
+]
diff --git a/mikeio1d/pandas_extension/mikeio1d_accessor.py b/mikeio1d/pandas_extension/mikeio1d_accessor.py
new file mode 100644
index 00000000..c75ca925
--- /dev/null
+++ b/mikeio1d/pandas_extension/mikeio1d_accessor.py
@@ -0,0 +1,80 @@
+import pandas as pd
+
+from .transposed_groupby import TransposedGroupBy
+from .result_reaches_helpers import agg_chainage
+from .result_reaches_helpers import groupby_chainage
+from .various import compact_dataframe
+
+
+@pd.api.extensions.register_dataframe_accessor("m1d")
+class Mikeio1dAccessor:
+    """
+    This class uses Pandas Extension API to register a custom accessor for DataFrames. More
+    information can be found here:
+
+    https://pandas.pydata.org/docs/development/extending.html#registering-custom-accessors
+
+    The accessor provides convenience methods for working with DataFrames with a MultiIndex.
+    The intent is as a facade for various helper functions that live elsewhere in the package. 
+ """ + + def __init__(self, pandas_obj): + self._validate(pandas_obj) + self._obj = pandas_obj + + @staticmethod + def _validate(obj): + if not isinstance(obj, pd.DataFrame): + raise AttributeError("Mikeio1dAccessor only supports DataFrames.") + df: pd.DataFrame = obj + if not isinstance(df.columns, pd.MultiIndex): + raise AttributeError("Must have a MultiIndex columns.") + + def _validate_has_chainage(self): + self._validate(self._obj) + if "chainage" not in self._obj.columns.names: + raise ValueError("DataFrame must have chainage column.") + + def agg_chainage(self, agg=None) -> pd.DataFrame: + """ + Convenience wrapper for ResultReaches.agg_chainage. + """ + self._validate_has_chainage() + kwargs = {} + if agg is not None: + kwargs["agg"] = agg + + return agg_chainage(self._obj, **kwargs) + + def groupby_chainage(self, *args, **kwargs) -> TransposedGroupBy: + """ + Convenience wrapper for pd.DataFrame.groupby. The groupby is performed on + the columns of the DataFrame, which are in the form of a MultiIndex. + """ + self._validate_has_chainage() + df: pd.DataFrame = self._obj + return groupby_chainage(df, *args, **kwargs) + + def groupby(self, *args, **kwargs) -> TransposedGroupBy: + """ + Convenience wrapper for pd.DataFrame.groupby. The groupby is performed on + the columns of the DataFrame, which are in the form of a MultiIndex. + """ + df: pd.DataFrame = self._obj + groupby = TransposedGroupBy(transposed_groupby=df.T.groupby(*args, **kwargs)) + return groupby + + def query(self, *args, **kwargs) -> pd.DataFrame: + """ + Convenience wrapper for pd.DataFrame.query. The query is performed on + the columns of the DataFrame, which are in the form of a MultiIndex. + """ + df = self._obj + return df.T.query(*args, **kwargs).T + + def compact(self, *args, **kwargs) -> pd.DataFrame: + """ + Convenience wrapper for compact_dataframe. 
+ """ + df = self._obj + return compact_dataframe(df, *args, **kwargs) diff --git a/mikeio1d/pandas_extension/result_reaches_helpers.py b/mikeio1d/pandas_extension/result_reaches_helpers.py new file mode 100644 index 00000000..e5152b66 --- /dev/null +++ b/mikeio1d/pandas_extension/result_reaches_helpers.py @@ -0,0 +1,42 @@ +import pandas as pd + +from .transposed_groupby import TransposedGroupBy + + +def groupby_chainage(df: pd.DataFrame, **kwargs) -> TransposedGroupBy: + """ + Group results for aggregation along the chainage axis. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with results. Must have hierarchical column index (e.g. column_mode = 'all'). + + Returns + ------- + groupby : TransposedGroupBy + GroupBy object, which can be used for aggregation. + """ + fixed_level_names = [n for n in df.columns.names if n != "chainage"] + groupby = TransposedGroupBy(transposed_groupby=df.T.groupby(fixed_level_names, **kwargs)) + return groupby + + +def agg_chainage(df: pd.DataFrame, agg=["first"], gb_kwargs: dict = {}, **kwargs) -> pd.DataFrame: + """ + Aggregate results along the chainage axis. + + Parameters + ---------- + df : pd.DataFrame + DataFrame with results. Must have hierarchical column index (e.g. column_mode = 'all'). + agg : function, str, list or dict + Aggregation function(s) to apply. Same as pandas.DataFrame.agg. + + Returns + ------- + df : pd.DataFrame + DataFrame with aggregated results. + """ + groupby = groupby_chainage(df, **gb_kwargs) + return groupby.agg(agg, **kwargs) diff --git a/mikeio1d/pandas_extension/transposed_groupby.py b/mikeio1d/pandas_extension/transposed_groupby.py new file mode 100644 index 00000000..a05e9921 --- /dev/null +++ b/mikeio1d/pandas_extension/transposed_groupby.py @@ -0,0 +1,34 @@ +import pandas as pd + + +class TransposedGroupBy: + """ + Same as pandas.DataFrameGroupBy, but returns the transpose of the result. 
+
+    Useful where a groupby is performed on a transposed DataFrame, and after
+    aggregation the result should be transposed back.
+
+    Parameters
+    ----------
+    transposed_groupby : pandas.DataFrameGroupBy
+        A pandas.DataFrameGroupBy object, which is created from a transposed DataFrame.
+
+    Examples
+    --------
+    >>> df = res.reaches.read(column_mode='all')
+    >>> groupby = TransposedGroupBy(df.T.groupby(['quantity']))
+    >>> groupby.agg(['mean', 'max'])
+    ... # performs agg function, then returns the transpose of the result.
+    """
+
+    def __init__(self, transposed_groupby):
+        self.transposed_groupby = transposed_groupby
+
+    def __getattr__(self, name):
+        # Delegate to the wrapped groupby. Non-callable attributes (e.g. .groups,
+        # .ngroups) are returned as-is; previously they were wrongly wrapped in a
+        # function, so attribute access returned a callable instead of the value.
+        attr = getattr(self.transposed_groupby, name)
+        if not callable(attr):
+            return attr
+
+        def method(*args, **kwargs):
+            result = attr(*args, **kwargs)
+            # Transpose DataFrame results back to the original orientation.
+            if isinstance(result, pd.DataFrame):
+                return result.T
+            return result
+
+        return method
diff --git a/mikeio1d/pandas_extension/various.py b/mikeio1d/pandas_extension/various.py
new file mode 100644
index 00000000..9848c73c
--- /dev/null
+++ b/mikeio1d/pandas_extension/various.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from dataclasses import fields
+
+import pandas as pd
+
+from ..quantities import TimeSeriesId
+
+
+def compact_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Convert a DataFrame with a hierarchical column index to a compact DataFrame.
+
+    A compact DataFrame removes levels where every value matches the TimeSeriesId default value.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with hierarchical column index.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Compact DataFrame. 
+ """ + index = df.columns + + is_hierarchical_index = isinstance(index, pd.MultiIndex) + if not is_hierarchical_index: + raise ValueError("DataFrame must have a hierarchical column index to compact.") + + for field in fields(TimeSeriesId): + level_values = index.get_level_values(field.name) + is_only_one_unique_value = len(level_values.unique()) == 1 + if not is_only_one_unique_value: + continue + level_value = level_values[0] + is_all_default_values = (level_value == field.default) or ( + level_value != level_value and field.default != field.default + ) + if is_all_default_values: + index = index.droplevel(field.name) + + df.columns = index + return df diff --git a/mikeio1d/res1d.py b/mikeio1d/res1d.py index 0dbbf963..e2b990b8 100644 --- a/mikeio1d/res1d.py +++ b/mikeio1d/res1d.py @@ -46,6 +46,8 @@ from .quantities import TimeSeriesId +from .pandas_extension import Mikeio1dAccessor # noqa: F401 + from System import DateTime diff --git a/notebooks/mikeio1d_dataframe_examples.ipynb b/notebooks/mikeio1d_dataframe_examples.ipynb new file mode 100644 index 00000000..f0eaa2a1 --- /dev/null +++ b/notebooks/mikeio1d_dataframe_examples.ipynb @@ -0,0 +1,304 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# MIKE IO 1D Pandas DataFrame examples\n", + "\n", + "Results in MIKE IO 1D are fundamentally extracted as Pandas DataFrames. This notebook shows\n", + "specific methods for working with those DataFrames." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from mikeio1d import Res1D\n", + "\n", + "res = Res1D('../tests/testdata/network_river.res1d')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Column modes\n", + "MIKE IO 1D supports different types of column headers. Depending on your use case, one or the other may be preferable. 
There are four main modes:\n", + "- string (default)\n", + "- all\n", + "- compact\n", + "- timeseries\n", + "\n", + "These are illustrated in the following cells." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# The default column mode concatenates information about the column with a ':' delimiter. \n", + "df = res.reaches.Discharge.read()\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# A hierarchical column mode is also supported. Only relevant levels are included with 'compact'. For full hierarchy use 'all'.\n", + "df = res.reaches.Discharge.read(column_mode='compact')\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# It's also possible to have a TimeSeriesId column index.\n", + "# TimeSeriesId is an object that uniquely identifies each time series.\n", + "df = res.reaches.Discharge.read(column_mode='timeseries')\n", + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MIKE IO 1D extends Pandas by providing a '.m1d' accessor on all DataFrames.\n", + "The accessor provides several useful methods for working with DataFrames within MIKE IO 1D:\n", + "- .m1d.query()\n", + "- .m1d.compact()\n", + "- .m1d.groupby()\n", + "- .m1d.groupby_chainage()\n", + "- .m1d.agg_chainage()\n", + "\n", + "These methods are illustrated below." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# To use the .m1d accessor, the DataFrame must have a MultiIndex column (e.g. 
column_mode='all' or 'compact').\n", + "# The .m1d accessor exists on the DataFrame itself.\n", + "df = res.read(column_mode='all')\n", + "df.m1d" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### .m1d.compact()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's make some queries on the DataFrame itself with help from the .m1d accessor.\n", + "# Let's read the entire file into a DataFrame with column_mode='all'.\n", + "df = res.read(column_mode='all')\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# That's a lot of detail included and is a bit busy. We can use .m1d.compact() to remove redundant levels.\n", + "df = df.m1d.compact()\n", + "df.head(2)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### .m1d.query()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's get Discharge for the reaches.\n", + "df = df.m1d.query(\"group=='Reach' and quantity=='Discharge'\")\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's look at all the reaches with 'trib' in their name.\n", + "df = df.m1d.query(\"name.str.contains('trib')\")\n", + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Let's look at the max discharge for each reach.\n", + "df.agg(['max'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Use some standard pandas methods to format the table a different way\n", + "# Tip: Chaining methods in brackets is a great way to explore the data. 
Comment out lines from bottom up to see the effect.\n",
+    "(\n",
+    "    df.agg(['max'])\n",
+    "    .T\n",
+    "    .droplevel(['group'])\n",
+    "    .unstack()\n",
+    "    #.pipe(lambda df: df * 2) # Uncomment this line to see the effect of the .pipe() method.\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Let's start from scratch and use bracket chaining to create the same table\n",
+    "df = (\n",
+    "    res.read(column_mode='all')\n",
+    "    .m1d.query(\"name.str.contains('trib')\")\n",
+    "    .m1d.compact()\n",
+    "    .m1d.query(\"quantity=='Discharge'\")\n",
+    "    .agg(['max'])\n",
+    "    .T\n",
+    "    .droplevel('group')\n",
+    "    .unstack()\n",
+    ")\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### .m1d.agg_chainage()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Now let's try something different. We aggregate the max discharge for each reach, then look at descriptive statistics along the time axis.\n",
+    "# Here, 'count' is the number of time steps and 'mean' is the mean of the max discharges of all Q-points along a reach.\n",
+    "(\n",
+    "    res.read(column_mode='all')\n",
+    "    .m1d.query(\"quantity=='Discharge'\")\n",
+    "    .m1d.compact()\n",
+    "    .m1d.agg_chainage('max')\n",
+    "    .describe()\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### .m1d.groupby_chainage()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Similarly, let's look at the mean of the first and last Q points.\n",
+    "(\n",
+    "    res.read(column_mode='all')\n",
+    "    .m1d.query(\"quantity=='Discharge'\")\n",
+    "    .m1d.compact()\n",
+    "    .m1d.groupby_chainage().nth([0,-1]) # First we groupby, then select the first and last chainages.\n",
+    "    .describe()\n",
+    "    .droplevel(['quantity','group'], axis=1)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+ "metadata": {}, + "source": [ + "#### .m1d.groupby()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# We can similarly use the .m1d accessor to group by dimensions other than chainage.\n", + "# Below we describe how the global maximum of each quantity in the system varies with time.\n", + "(\n", + " res.read(column_mode='all')\n", + " .m1d.groupby('quantity').agg('max')\n", + " .describe()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/tests/test_pandas_extension.py b/tests/test_pandas_extension.py new file mode 100644 index 00000000..efc857bb --- /dev/null +++ b/tests/test_pandas_extension.py @@ -0,0 +1,90 @@ +import pytest +import pandas as pd +from pandas.testing import assert_frame_equal + +from mikeio1d.pandas_extension import TransposedGroupBy +from mikeio1d.pandas_extension import groupby_chainage +from mikeio1d.pandas_extension import agg_chainage + +from mikeio1d.result_reader_writer.result_reader import ColumnMode +from mikeio1d.pandas_extension import TransposedGroupBy +import pandas as pd +from mikeio1d.pandas_extension import TransposedGroupBy, groupby_chainage, agg_chainage + + +@pytest.fixture +def sample_dataframe(res1d_river_network) -> pd.DataFrame: + df = res1d_river_network.read(column_mode=ColumnMode.COMPACT) + return df + + +def test_groupby_chainage(sample_dataframe): + # Test groupby_chainage function + groupby = groupby_chainage(sample_dataframe) + assert 
isinstance(groupby, TransposedGroupBy) + assert groupby.max().max().max() == pytest.approx(sample_dataframe.max().max()) + assert groupby.min().min().min() == pytest.approx(sample_dataframe.min().min()) + + +def test_agg_chainage(sample_dataframe): + # Test agg_chainage function + agg_result = agg_chainage(sample_dataframe, agg=["max"]) + assert isinstance(agg_result, pd.DataFrame) + assert agg_result.max().max() == pytest.approx(sample_dataframe.max().max()) + + @pytest.fixture + def sample_dataframe(): + # Create a sample transposed DataFrame + df = pd.DataFrame({"quantity": ["A", "A", "B", "B"], "value": [1, 2, 3, 4]}) + return df + + +def test_transposed_groupby(sample_dataframe): + groupby = sample_dataframe.T.groupby("quantity") + groupby_transposed = TransposedGroupBy(groupby) + assert groupby_transposed.transposed_groupby is groupby + + df_expected = groupby.max().T + df = groupby_transposed.max() + + assert_frame_equal(df, df_expected) + + df_expected = groupby.min().T + df = groupby_transposed.min() + assert_frame_equal(df, df_expected) + + df_expected = groupby.first().T + df = groupby_transposed.first() + assert_frame_equal(df, df_expected) + + +def test_m1d_accessor(sample_dataframe): + assert sample_dataframe.m1d is not None + + +def test_m1d_agg_chainage(sample_dataframe): + df_expected = agg_chainage(sample_dataframe) + df = sample_dataframe.m1d.agg_chainage() + assert_frame_equal(df, df_expected) + + df_expected = agg_chainage(sample_dataframe, agg=["max"]) + df = sample_dataframe.m1d.agg_chainage(agg=["max"]) + assert_frame_equal(df, df_expected) + + +def test_m1d_groupby_chainage(sample_dataframe): + df_expected = groupby_chainage(sample_dataframe).nth(0) + df = sample_dataframe.m1d.groupby_chainage().nth(0) + assert_frame_equal(df, df_expected) + + +def test_m1d_query(sample_dataframe): + df_expected = sample_dataframe.T.query("quantity == 'WaterLevel'").T + df = sample_dataframe.m1d.query("quantity == 'WaterLevel'") + assert_frame_equal(df, 
df_expected) + + +def test_m1d_groupby(sample_dataframe): + df_expected = sample_dataframe.T.groupby("quantity").max().T + df = sample_dataframe.m1d.groupby("quantity").max() + assert_frame_equal(df, df_expected)