From ccb3e7caef25ee72fc749cbdbb122de8df5366e5 Mon Sep 17 00:00:00 2001 From: Wes Warriner Date: Thu, 14 Nov 2024 19:22:08 -0800 Subject: [PATCH] rtc: replace lazy arg with more specific backend arg to DataLoader and friends; make backend actually backend, for the sake of consistency, by separating logic for polars_lazy vs polars_eager --- src/onemod/fsutils/config_loader.py | 3 +- src/onemod/fsutils/data_loader.py | 57 +++++++++++++++++++---------- src/onemod/fsutils/interface.py | 13 +++---- src/onemod/stage/base.py | 2 +- 4 files changed, 46 insertions(+), 29 deletions(-) diff --git a/src/onemod/fsutils/config_loader.py b/src/onemod/fsutils/config_loader.py index 2659ee6a..3c41fcf3 100644 --- a/src/onemod/fsutils/config_loader.py +++ b/src/onemod/fsutils/config_loader.py @@ -24,5 +24,4 @@ def dump(self, obj: Any, path: Path, **options) -> None: self.io_dict[path.suffix].dump(obj, path, **options) def __repr__(self) -> str: - base_repr = super().__repr__() - return f"{base_repr[:-1]}, handles_configs=True\n)" + return f"{type(self).__name__}()" diff --git a/src/onemod/fsutils/data_loader.py b/src/onemod/fsutils/data_loader.py index fdada5e7..8ea69e56 100644 --- a/src/onemod/fsutils/data_loader.py +++ b/src/onemod/fsutils/data_loader.py @@ -16,49 +16,69 @@ class DataLoader: def load( self, path: Path, - lazy: bool = False, - backend: Literal["polars", "pandas"] = "polars", + backend: Literal[ + "polars", "polars_lazy", "polars_eager", "pandas" + ] = "polars", columns: list[str] | None = None, id_subsets: dict[str, list] | None = None, **options, ) -> pl.DataFrame | pl.LazyFrame | pd.DataFrame: """Load data with optional lazy loading and subset filtering. Supports both Polars and Pandas backends.""" - if lazy and backend == "pandas": - raise ValueError("Pandas backend does not support lazy loading") - if path.suffix not in self.io_dict: raise ValueError(f"Unsupported data format for '{path.suffix}'") - if backend == "polars": - lf = self.io_dict[path.suffix].load_lazy(path, **options) + if backend in ["polars", "polars_eager"]: + polars_df = self.io_dict[path.suffix].load_eager( + path, backend="polars", **options + ) + + if not isinstance(polars_df, pl.DataFrame): + raise TypeError( + f"Expected a Polars DataFrame, got {type(polars_df)}" + ) + + if columns: + polars_df = polars_df.select(columns) + + if id_subsets: + for col, values in id_subsets.items(): + polars_df = polars_df.filter(pl.col(col).is_in(values)) + + return polars_df + elif backend == "polars_lazy": + polars_lf = self.io_dict[path.suffix].load_lazy(path, **options) if columns: - lf = lf.select(columns) + polars_lf = polars_lf.select(columns) if id_subsets: for col, values in id_subsets.items(): - lf = lf.filter(pl.col(col).is_in(values)) + polars_lf = polars_lf.filter(pl.col(col).is_in(values)) - return lf if lazy else lf.collect() + return polars_lf elif backend == "pandas": - df = self.io_dict[path.suffix].load_eager( + pandas_df = self.io_dict[path.suffix].load_eager( path, backend="pandas", **options ) - if not isinstance(df, pd.DataFrame): - raise TypeError(f"Expected a Pandas DataFrame, got {type(df)}") + if not isinstance(pandas_df, pd.DataFrame): + raise TypeError( + f"Expected a Pandas DataFrame, got {type(pandas_df)}" + ) if columns: - df = df[columns] + pandas_df = pandas_df[columns] if id_subsets: for col, values in id_subsets.items(): - df = df[df[col].isin(values)] + pandas_df = pandas_df[pandas_df[col].isin(values)] - return df + return pandas_df else: - raise ValueError("Backend must be either 'polars' or 'pandas'") + raise ValueError( + "Backend must be one of 'polars', 'polars_lazy', 'polars_eager', or 'pandas'" + ) def dump( self, @@ -73,5 +93,4 @@ def dump( self.io_dict[path.suffix].dump(obj, path, **options) def __repr__(self) -> str: - base_repr = super().__repr__() - return f"{base_repr[:-1]}, supports_lazy_loading=True\n)" + return f"{type(self).__name__}()" diff --git a/src/onemod/fsutils/interface.py b/src/onemod/fsutils/interface.py index b009a161..61e4bc1c 100644 --- a/src/onemod/fsutils/interface.py +++ b/src/onemod/fsutils/interface.py @@ -20,8 +20,9 @@ def load( self, *fparts: str, key: str, - lazy: bool = False, - backend: Literal["polars", "pandas"] = "polars", + backend: Literal[ + "polars", "polars_lazy", "polars_eager", "pandas" + ] = "polars", columns: list[str] | None = None, id_subsets: dict[str, list] | None = None, **options, @@ -30,10 +31,9 @@ def load( Parameters ---------- - lazy : bool, optional - Whether to load the data file lazily, applicable only for data files. - backend : {'polars', 'pandas'}, optional - Backend for loading data files, applicable only for data files. + backend : {'polars', 'polars_eager', 'polars_lazy', 'pandas'}, optional + Backend for loading data files. 'polars' is an alias for 'polars_eager'. + 'polars' is the default. columns : list of str, optional Specific columns to load, applicable only for data files. id_subsets : dict, optional @@ -52,7 +52,6 @@ def load( elif path.suffix in self.data_loader.io_dict.keys(): return self.data_loader.load( path, - lazy=lazy, backend=backend, columns=columns, id_subsets=id_subsets, diff --git a/src/onemod/stage/base.py b/src/onemod/stage/base.py index 0973896a..a0547107 100644 --- a/src/onemod/stage/base.py +++ b/src/onemod/stage/base.py @@ -395,7 +395,7 @@ def create_stage_subsets( key=data_key, columns=list(self.groupby), id_subsets=id_subsets, - lazy=True, + backend="polars_lazy", ) subsets_df = create_subsets(self.groupby, lf)