Skip to content

Commit

Permalink
rtc: replace backend with return_type for data loading; always use la…
Browse files Browse the repository at this point in the history
…zy loading at first, and any column selection or id subsetting requested to be done within load() now uses the lazy API
  • Loading branch information
blsmxiu47 committed Nov 20, 2024
1 parent ccb3e7c commit 950a088
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 57 deletions.
67 changes: 18 additions & 49 deletions src/onemod/fsutils/data_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,68 +16,37 @@ class DataLoader:
def load(
self,
path: Path,
backend: Literal[
"polars", "polars_lazy", "polars_eager", "pandas"
] = "polars",
return_type: Literal[
"polars_dataframe", "polars_lazyframe", "pandas_dataframe"
] = "polars_dataframe",
columns: list[str] | None = None,
id_subsets: dict[str, list] | None = None,
**options,
) -> pl.DataFrame | pl.LazyFrame | pd.DataFrame:
"""Load data with optional lazy loading and subset filtering. Supports
both Polars and Pandas backends."""
"""Load data with lazy loading and subset filtering. Polars and
Pandas options available for the type of the returned data object."""

if path.suffix not in self.io_dict:
raise ValueError(f"Unsupported data format for '{path.suffix}'")

if backend in ["polars", "polars_eager"]:
polars_df = self.io_dict[path.suffix].load_eager(
path, backend="polars", **options
)

if not isinstance(polars_df, pl.DataFrame):
raise TypeError(
f"Expected a Polars DataFrame, got {type(polars_df)}"
)

if columns:
polars_df = polars_df.select(columns)

if id_subsets:
for col, values in id_subsets.items():
polars_df = polars_df.filter(pl.col(col).is_in(values))

return polars_df
elif backend == "polars_lazy":
polars_lf = self.io_dict[path.suffix].load_lazy(path, **options)
polars_lf = self.io_dict[path.suffix].load_lazy(path, **options)

if columns:
polars_lf = polars_lf.select(columns)
if columns:
polars_lf = polars_lf.select(columns)

if id_subsets:
for col, values in id_subsets.items():
polars_lf = polars_lf.filter(pl.col(col).is_in(values))
if id_subsets:
for col, values in id_subsets.items():
polars_lf = polars_lf.filter(pl.col(col).is_in(values))

if return_type == "polars_dataframe":
return polars_lf.collect()
elif return_type == "polars_lazyframe":
return polars_lf
elif backend == "pandas":
pandas_df = self.io_dict[path.suffix].load_eager(
path, backend="pandas", **options
)

if not isinstance(pandas_df, pd.DataFrame):
raise TypeError(
f"Expected a Pandas DataFrame, got {type(pandas_df)}"
)

if columns:
pandas_df = pandas_df[columns]

if id_subsets:
for col, values in id_subsets.items():
pandas_df = pandas_df[pandas_df[col].isin(values)]

return pandas_df
elif return_type == "pandas_dataframe":
return polars_lf.collect().to_pandas()
else:
raise ValueError(
"Backend must be one of 'polars', 'polars_lazy', 'polars_eager', or 'pandas'"
"Return type must be one of 'polars_dataframe', 'polars_lazyframe', or 'pandas_dataframe'"
)

def dump(
Expand Down
13 changes: 6 additions & 7 deletions src/onemod/fsutils/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ def load(
self,
*fparts: str,
key: str,
backend: Literal[
"polars", "polars_lazy", "polars_eager", "pandas"
] = "polars",
return_type: Literal[
"polars_dataframe", "polars_lazyframe", "pandas_dataframe"
] = "polars_dataframe",
columns: list[str] | None = None,
id_subsets: dict[str, list] | None = None,
**options,
Expand All @@ -31,9 +31,8 @@ def load(
Parameters
----------
backend : {'polars', 'polars_eager', 'polars_lazy', 'pandas'}, optional
Backend for loading data files. 'polars' is an alias for 'polars_eager'.
'polars' is the default.
return_type : {'polars_dataframe', 'polars_lazyframe', 'pandas_dataframe'}, optional
Return type of loaded data object, applicable only for data files.
columns : list of str, optional
Specific columns to load, applicable only for data files.
id_subsets : dict, optional
Expand All @@ -52,7 +51,7 @@ def load(
elif path.suffix in self.data_loader.io_dict.keys():
return self.data_loader.load(
path,
backend=backend,
return_type=return_type,
columns=columns,
id_subsets=id_subsets,
**options,
Expand Down
2 changes: 1 addition & 1 deletion src/onemod/stage/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -395,7 +395,7 @@ def create_stage_subsets(
key=data_key,
columns=list(self.groupby),
id_subsets=id_subsets,
backend="polars_lazy",
return_type="polars_lazyframe",
)

subsets_df = create_subsets(self.groupby, lf)
Expand Down

0 comments on commit 950a088

Please sign in to comment.