Merge pull request #77 from DHI/feature/pandas_dataframe_accessor_ext…

…ension Feature/pandas dataframe accessor extension
DHI · Jan 24, 2024 · d41a0f4 · d41a0f4
2 parents d4686ba + f5aea7f
commit d41a0f4
Show file tree

Hide file tree

Showing 8 changed files with 610 additions and 0 deletions.
diff --git a/mikeio1d/pandas_extension/__init__.py b/mikeio1d/pandas_extension/__init__.py
@@ -0,0 +1,13 @@
+from .mikeio1d_accessor import Mikeio1dAccessor  # noqa
+from .transposed_groupby import TransposedGroupBy  # noqa
+from .result_reaches_helpers import agg_chainage  # noqa
+from .result_reaches_helpers import groupby_chainage  # noqa
+from .various import compact_dataframe
+
+# Names exported by ``from mikeio1d.pandas_extension import *``.
+# The variable must be spelled ``__all__`` (two trailing underscores) for Python to honour it.
+__all__ = [
+    "Mikeio1dAccessor",
+    "TransposedGroupBy",
+    "agg_chainage",
+    "groupby_chainage",
+    "compact_dataframe",
+]
diff --git a/mikeio1d/pandas_extension/mikeio1d_accessor.py b/mikeio1d/pandas_extension/mikeio1d_accessor.py
@@ -0,0 +1,80 @@
+import pandas as pd
+
+from .transposed_groupby import TransposedGroupBy
+from .result_reaches_helpers import agg_chainage
+from .result_reaches_helpers import groupby_chainage
+from .various import compact_dataframe
+
+
+@pd.api.extensions.register_dataframe_accessor("m1d")
+class Mikeio1dAccessor:
+    """
+    Custom ``m1d`` accessor for DataFrames, registered through the Pandas Extension API:
+
+    https://pandas.pydata.org/docs/development/extending.html#registering-custom-accessors
+
+    The accessor only accepts DataFrames whose columns are a MultiIndex. It is a thin facade:
+    each method delegates to a helper function that lives elsewhere in the package.
+    """
+
+    def __init__(self, pandas_obj):
+        # Validate first so that an unsupported object is rejected before it is stored.
+        self._validate(pandas_obj)
+        self._obj = pandas_obj
+
+    @staticmethod
+    def _validate(obj):
+        """
+        Raise AttributeError unless obj is a DataFrame with MultiIndex columns.
+        """
+        is_dataframe = isinstance(obj, pd.DataFrame)
+        if not is_dataframe:
+            raise AttributeError("Mikeio1dAccessor only supports DataFrames.")
+
+        has_multiindex_columns = isinstance(obj.columns, pd.MultiIndex)
+        if not has_multiindex_columns:
+            raise AttributeError("Must have a MultiIndex columns.")
+
+    def _validate_has_chainage(self):
+        """
+        Re-run the basic validation, then raise ValueError if no column level is named 'chainage'.
+        """
+        self._validate(self._obj)
+        level_names = self._obj.columns.names
+        if "chainage" in level_names:
+            return
+        raise ValueError("DataFrame must have chainage column.")
+
+    def agg_chainage(self, agg=None) -> pd.DataFrame:
+        """
+        Convenience wrapper for the agg_chainage helper.
+
+        Parameters
+        ----------
+        agg : function, str, list or dict, optional
+            Aggregation to apply. When None, the helper's own default is used.
+        """
+        self._validate_has_chainage()
+        # Only forward 'agg' when it was given, so the helper's default applies otherwise.
+        forwarded = {} if agg is None else {"agg": agg}
+        return agg_chainage(self._obj, **forwarded)
+
+    def groupby_chainage(self, *args, **kwargs) -> TransposedGroupBy:
+        """
+        Convenience wrapper for the groupby_chainage helper. All arguments are
+        forwarded to the helper together with the wrapped DataFrame.
+        """
+        self._validate_has_chainage()
+        return groupby_chainage(self._obj, *args, **kwargs)
+
+    def groupby(self, *args, **kwargs) -> TransposedGroupBy:
+        """
+        Convenience wrapper for pd.DataFrame.groupby. The DataFrame is transposed before
+        grouping, so the groupby operates on the MultiIndex columns; the resulting
+        groupby object is wrapped in a TransposedGroupBy.
+        """
+        transposed = self._obj.T
+        return TransposedGroupBy(transposed_groupby=transposed.groupby(*args, **kwargs))
+
+    def query(self, *args, **kwargs) -> pd.DataFrame:
+        """
+        Convenience wrapper for pd.DataFrame.query. The query is evaluated on the
+        transposed DataFrame (i.e. against the MultiIndex columns) and the result
+        is transposed back.
+        """
+        transposed = self._obj.T
+        selected = transposed.query(*args, **kwargs)
+        return selected.T
+
+    def compact(self, *args, **kwargs) -> pd.DataFrame:
+        """
+        Convenience wrapper for compact_dataframe.
+        """
+        return compact_dataframe(self._obj, *args, **kwargs)
diff --git a/mikeio1d/pandas_extension/result_reaches_helpers.py b/mikeio1d/pandas_extension/result_reaches_helpers.py
@@ -0,0 +1,42 @@
+import pandas as pd
+
+from .transposed_groupby import TransposedGroupBy
+
+
+def groupby_chainage(df: pd.DataFrame, *args, **kwargs) -> TransposedGroupBy:
+    """
+    Group results for aggregation along the chainage axis.
+
+    The DataFrame is transposed and grouped by every column level except 'chainage',
+    so that each group collects the columns that differ only in chainage.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with results. Must have hierarchical column index (e.g. column_mode = 'all').
+    *args, **kwargs
+        Extra arguments forwarded to pandas.DataFrame.groupby after the grouping keys.
+        Positional arguments are accepted because the 'm1d' accessor forwards them.
+
+    Returns
+    -------
+    groupby : TransposedGroupBy
+        GroupBy object, which can be used for aggregation.
+    """
+    fixed_level_names = [n for n in df.columns.names if n != "chainage"]
+    transposed_groupby = df.T.groupby(fixed_level_names, *args, **kwargs)
+    return TransposedGroupBy(transposed_groupby=transposed_groupby)
+
+
+def agg_chainage(df: pd.DataFrame, agg=None, gb_kwargs: dict = None, **kwargs) -> pd.DataFrame:
+    """
+    Aggregate results along the chainage axis.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with results. Must have hierarchical column index (e.g. column_mode = 'all').
+    agg : function, str, list or dict, optional
+        Aggregation function(s) to apply. Same as pandas.DataFrame.agg.
+        Defaults to ["first"] when None.
+    gb_kwargs : dict, optional
+        Keyword arguments passed to groupby_chainage. Defaults to no extra arguments.
+    **kwargs
+        Additional keyword arguments passed to the groupby's agg call.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        DataFrame with aggregated results.
+    """
+    # None sentinels replace mutable default arguments, which are shared between calls.
+    if agg is None:
+        agg = ["first"]
+    if gb_kwargs is None:
+        gb_kwargs = {}
+
+    groupby = groupby_chainage(df, **gb_kwargs)
+    return groupby.agg(agg, **kwargs)
diff --git a/mikeio1d/pandas_extension/transposed_groupby.py b/mikeio1d/pandas_extension/transposed_groupby.py
@@ -0,0 +1,34 @@
+import pandas as pd
+
+
+class TransposedGroupBy:
+    """
+    Same as pandas.DataFrameGroupBy, but returns the transpose of the result.
+
+    Useful where a groupby is performed on a transposed DataFrame, and after
+    aggregation the result should be transposed back.
+
+    Attribute access is delegated to the wrapped groupby object. Callable attributes
+    are wrapped so that a DataFrame result is transposed; non-callable attributes
+    are returned unchanged.
+
+    Parameters
+    ----------
+    transposed_groupby : pandas.DataFrameGroupBy
+        A pandas.DataFrameGroupBy object, which is created from a transposed DataFrame.
+
+    Examples
+    --------
+    >>> df = res.reaches.read(column_mode='all')
+    >>> groupby = TransposedGroupBy(df.T.groupby(['quantity']))
+    >>> groupby.agg(['mean', 'max'])
+    ... # performs agg function, then returns the transpose of the result.
+    """
+
+    def __init__(self, transposed_groupby):
+        self.transposed_groupby = transposed_groupby
+
+    def __getattr__(self, name):
+        # __getattr__ is only called when normal lookup fails. Dunder names are not
+        # delegated, so protocol probes (copy, pickle, hasattr) get an honest answer.
+        # 'transposed_groupby' itself is refused to avoid infinite recursion when the
+        # instance has not been initialised yet.
+        is_dunder = name.startswith("__") and name.endswith("__")
+        if is_dunder or name == "transposed_groupby":
+            raise AttributeError(name)
+
+        # Resolve eagerly so that unknown names raise AttributeError at access time.
+        attr = getattr(self.transposed_groupby, name)
+        if not callable(attr):
+            return attr
+
+        def method(*args, **kwargs):
+            result = attr(*args, **kwargs)
+            if isinstance(result, pd.DataFrame):
+                return result.T
+            return result
+
+        return method
diff --git a/mikeio1d/pandas_extension/various.py b/mikeio1d/pandas_extension/various.py
@@ -0,0 +1,45 @@
+from __future__ import annotations
+
+from dataclasses import fields
+
+import pandas as pd
+
+from ..quantities import TimeSeriesId
+
+
+def compact_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Convert a DataFrame with a hierarchical column index to a compact DataFrame.
+
+    A compact DataFrame removes levels where every value matches the TimeSeriesId default value.
+    TimeSeriesId fields that are not present as column levels are skipped, and the last
+    remaining level is never dropped.
+
+    Note that the column index of the passed DataFrame is replaced in place, and that
+    same DataFrame object is returned.
+
+    Parameters
+    ----------
+    df : pd.DataFrame
+        DataFrame with hierarchical column index.
+
+    Returns
+    -------
+    df : pd.DataFrame
+        Compact DataFrame.
+
+    Raises
+    ------
+    ValueError
+        If the columns of df are not a MultiIndex.
+    """
+    index = df.columns
+
+    is_hierarchical_index = isinstance(index, pd.MultiIndex)
+    if not is_hierarchical_index:
+        raise ValueError("DataFrame must have a hierarchical column index to compact.")
+
+    for field in fields(TimeSeriesId):
+        # droplevel cannot remove the only remaining level, so stop once one is left.
+        if index.nlevels == 1:
+            break
+        # A level may be absent, e.g. after an aggregation that removed 'chainage'.
+        if field.name not in index.names:
+            continue
+        level_values = index.get_level_values(field.name)
+        is_only_one_unique_value = len(level_values.unique()) == 1
+        if not is_only_one_unique_value:
+            continue
+        level_value = level_values[0]
+        # 'x != x' is only true for NaN, so the second clause matches a NaN value
+        # against a NaN default, which '==' alone would never do.
+        is_all_default_values = (level_value == field.default) or (
+            level_value != level_value and field.default != field.default
+        )
+        if is_all_default_values:
+            index = index.droplevel(field.name)
+
+    df.columns = index
+    return df
diff --git a/mikeio1d/res1d.py b/mikeio1d/res1d.py
@@ -46,6 +46,8 @@
 
 from .quantities import TimeSeriesId
 
+from .pandas_extension import Mikeio1dAccessor  # noqa: F401
+
 from System import DateTime