Skip to content

Commit

Permalink
rtc: replace lazy arg with more specific backend arg to DataLoader and friends; make backend actually backend, for the sake of consistency, by separating logic for polars_lazy vs polars_eager
Browse files Browse the repository at this point in the history
  • Loading branch information
blsmxiu47 committed Nov 15, 2024
1 parent 32f8842 commit ccb3e7c
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 29 deletions.
3 changes: 1 addition & 2 deletions src/onemod/fsutils/config_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,4 @@ def dump(self, obj: Any, path: Path, **options) -> None:
self.io_dict[path.suffix].dump(obj, path, **options)

def __repr__(self) -> str:
base_repr = super().__repr__()
return f"{base_repr[:-1]}, handles_configs=True\n)"
return f"{type(self).__name__}()"
57 changes: 38 additions & 19 deletions src/onemod/fsutils/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,49 +16,69 @@ class DataLoader:
def load(
self,
path: Path,
lazy: bool = False,
backend: Literal["polars", "pandas"] = "polars",
backend: Literal[
"polars", "polars_lazy", "polars_eager", "pandas"
] = "polars",
columns: list[str] | None = None,
id_subsets: dict[str, list] | None = None,
**options,
) -> pl.DataFrame | pl.LazyFrame | pd.DataFrame:
"""Load data with optional lazy loading and subset filtering. Supports
both Polars and Pandas backends."""
if lazy and backend == "pandas":
raise ValueError("Pandas backend does not support lazy loading")

if path.suffix not in self.io_dict:
raise ValueError(f"Unsupported data format for '{path.suffix}'")

if backend == "polars":
lf = self.io_dict[path.suffix].load_lazy(path, **options)
if backend in ["polars", "polars_eager"]:
polars_df = self.io_dict[path.suffix].load_eager(
path, backend="polars", **options
)

if not isinstance(polars_df, pl.DataFrame):
raise TypeError(
f"Expected a Polars DataFrame, got {type(polars_df)}"
)

if columns:
polars_df = polars_df.select(columns)

if id_subsets:
for col, values in id_subsets.items():
polars_df = polars_df.filter(pl.col(col).is_in(values))

return polars_df
elif backend == "polars_lazy":
polars_lf = self.io_dict[path.suffix].load_lazy(path, **options)

if columns:
lf = lf.select(columns)
polars_lf = polars_lf.select(columns)

if id_subsets:
for col, values in id_subsets.items():
lf = lf.filter(pl.col(col).is_in(values))
polars_lf = polars_lf.filter(pl.col(col).is_in(values))

return lf if lazy else lf.collect()
return polars_lf
elif backend == "pandas":
df = self.io_dict[path.suffix].load_eager(
pandas_df = self.io_dict[path.suffix].load_eager(
path, backend="pandas", **options
)

if not isinstance(df, pd.DataFrame):
raise TypeError(f"Expected a Pandas DataFrame, got {type(df)}")
if not isinstance(pandas_df, pd.DataFrame):
raise TypeError(
f"Expected a Pandas DataFrame, got {type(pandas_df)}"
)

if columns:
df = df[columns]
pandas_df = pandas_df[columns]

if id_subsets:
for col, values in id_subsets.items():
df = df[df[col].isin(values)]
pandas_df = pandas_df[pandas_df[col].isin(values)]

return df
return pandas_df
else:
raise ValueError("Backend must be either 'polars' or 'pandas'")
raise ValueError(
"Backend must be one of 'polars', 'polars_lazy', 'polars_eager', or 'pandas'"
)

def dump(
self,
Expand All @@ -73,5 +93,4 @@ def dump(
self.io_dict[path.suffix].dump(obj, path, **options)

def __repr__(self) -> str:
base_repr = super().__repr__()
return f"{base_repr[:-1]}, supports_lazy_loading=True\n)"
return f"{type(self).__name__}()"
13 changes: 6 additions & 7 deletions src/onemod/fsutils/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,9 @@ def load(
self,
*fparts: str,
key: str,
lazy: bool = False,
backend: Literal["polars", "pandas"] = "polars",
backend: Literal[
"polars", "polars_lazy", "polars_eager", "pandas"
] = "polars",
columns: list[str] | None = None,
id_subsets: dict[str, list] | None = None,
**options,
Expand All @@ -30,10 +31,9 @@ def load(
Parameters
----------
lazy : bool, optional
Whether to load the data file lazily, applicable only for data files.
backend : {'polars', 'pandas'}, optional
Backend for loading data files, applicable only for data files.
backend : {'polars', 'polars_eager', 'polars_lazy', 'pandas'}, optional
Backend for loading data files. 'polars' is an alias for 'polars_eager'.
'polars' is the default.
columns : list of str, optional
Specific columns to load, applicable only for data files.
id_subsets : dict, optional
Expand All @@ -52,7 +52,6 @@ def load(
elif path.suffix in self.data_loader.io_dict.keys():
return self.data_loader.load(
path,
lazy=lazy,
backend=backend,
columns=columns,
id_subsets=id_subsets,
Expand Down
2 changes: 1 addition & 1 deletion src/onemod/stage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def create_stage_subsets(
key=data_key,
columns=list(self.groupby),
id_subsets=id_subsets,
lazy=True,
backend="polars_lazy",
)

subsets_df = create_subsets(self.groupby, lf)
Expand Down

0 comments on commit ccb3e7c

Please sign in to comment.