Skip to content

Commit

Permalink
Add multi_index configuration option
Browse files Browse the repository at this point in the history
Used to decide whether to apply reset_index() to the features DataFrames
  • Loading branch information
GianlucaFicarelli committed Apr 19, 2024
1 parent e6dfe33 commit b688d88
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/blueetl/config/analysis_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@ class FeaturesConfig(BaseModel):
params_product: dict[str, Any] = {}
params_zip: dict[str, Any] = {}
suffix: str = ""
multi_index: bool = True


class SingleAnalysisConfig(BaseModel):
Expand Down
6 changes: 5 additions & 1 deletion src/blueetl/features.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,11 @@ def _func_wrapper(
# ignore the index if it's unnamed and with one level; this can be useful
# for example when the returned DataFrame has a RangeIndex to be dropped
drop = result_df.index.names == [None]
result_df = result_df.etl.add_conditions(conditions=key._fields, values=key, drop=drop)
if features_config.multi_index:
result_df = result_df.etl.add_conditions(conditions=key._fields, values=key, drop=drop)
else:
result_df.reset_index(drop=drop, inplace=True)
result_df.etl.insert_columns(loc=0, columns=key._fields, values=key)
features_records[feature_group + features_config.suffix] = result_df
return features_records

Expand Down
9 changes: 9 additions & 0 deletions src/blueetl/schemas/analysis_config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -444,6 +444,15 @@ $defs:
A numeric suffix is automatically added when any of ``params_product`` or ``params_zip`` is specified.
default: "''"
type: string
multi_index:
title: MultiIndex
description: |
- If True, do not reset the index of the resulting DataFrames of features, and add the values specified in ``groupby`` to the MultiIndex.
- If False, reset the index, returning columnar DataFrames.
The DataFrames with MultiIndex should use less memory than the columnar DataFrames, but they take more time to load and dump to disk.
type: boolean
default: "true"
required:
- type
- groupby
Expand Down

0 comments on commit b688d88

Please sign in to comment.