From db459d13ffb7713850c95dfaab917223da81fb82 Mon Sep 17 00:00:00 2001 From: Gianluca Ficarelli <26835404+GianlucaFicarelli@users.noreply.github.com> Date: Tue, 30 Apr 2024 11:33:12 +0200 Subject: [PATCH] [NSETM-2311] Add multi_index configuration option (#35) Used to decide to apply reset_index() to the features dataframes --- CHANGELOG.rst | 10 ++++++++++ src/blueetl/config/analysis_model.py | 1 + src/blueetl/features.py | 6 +++++- src/blueetl/schemas/analysis_config.yaml | 9 +++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b43434d..2c42116 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,6 +1,16 @@ Changelog ========= +Version 0.10.0 +-------------- + +New Features +~~~~~~~~~~~~ + +- Add ``multi_index`` option to the features configuration, to decide whether ``reset_index()`` should be applied to the features DataFrames. + Note: the features cache will be rebuilt, although the resulting DataFrames are unchanged (because the default value of the new option is ``True``). 
+ + Version 0.9.1 ------------- diff --git a/src/blueetl/config/analysis_model.py b/src/blueetl/config/analysis_model.py index dcf30a8..c2ccfb1 100644 --- a/src/blueetl/config/analysis_model.py +++ b/src/blueetl/config/analysis_model.py @@ -176,6 +176,7 @@ class FeaturesConfig(BaseModel): params_product: dict[str, Any] = {} params_zip: dict[str, Any] = {} suffix: str = "" + multi_index: bool = True class SingleAnalysisConfig(BaseModel): diff --git a/src/blueetl/features.py b/src/blueetl/features.py index 9fbf0e6..d81311b 100644 --- a/src/blueetl/features.py +++ b/src/blueetl/features.py @@ -431,7 +431,11 @@ def _user_func_wrapper( # ignore the index if it's unnamed and with one level; this can be useful # for example when the returned DataFrame has a RangeIndex to be dropped drop = result_df.index.names == [None] - result_df = result_df.etl.add_conditions(conditions=key._fields, values=key, drop=drop) + if features_config.multi_index: + result_df = result_df.etl.add_conditions(conditions=key._fields, values=key, drop=drop) + else: + result_df.reset_index(drop=drop, inplace=True) + result_df.etl.insert_columns(loc=0, columns=key._fields, values=key) # the conversion to the desired dtype here is important to reduce memory usage and cpu time result_df = ensure_dtypes(result_df) output_dir = temp_folder / f"{feature_group}{features_config.suffix}" diff --git a/src/blueetl/schemas/analysis_config.yaml b/src/blueetl/schemas/analysis_config.yaml index f5bdaba..c57cf9b 100644 --- a/src/blueetl/schemas/analysis_config.yaml +++ b/src/blueetl/schemas/analysis_config.yaml @@ -483,6 +483,15 @@ $defs: A numeric suffix is automatically added when any of ``params_product`` or ``params_zip`` is specified. default: "''" type: string + multi_index: + title: MultiIndex + description: | + - If True, do not reset the index of the resulting DataFrames of features, and add the values specified in ``groupby`` to the MultiIndex. + - If False, reset the index, returning columnar DataFrames. 
+ + The DataFrames with MultiIndex should use less memory than the columnar DataFrames, but they take more time to load and dump to disk. + type: boolean + default: "true" required: - type - groupby