From db459d13ffb7713850c95dfaab917223da81fb82 Mon Sep 17 00:00:00 2001
From: Gianluca Ficarelli <26835404+GianlucaFicarelli@users.noreply.github.com>
Date: Tue, 30 Apr 2024 11:33:12 +0200
Subject: [PATCH] [NSETM-2311] Add multi_index configuration option (#35)

Used to decide whether to apply reset_index() to the features DataFrames.
---
 CHANGELOG.rst                            | 10 ++++++++++
 src/blueetl/config/analysis_model.py     |  1 +
 src/blueetl/features.py                  |  6 +++++-
 src/blueetl/schemas/analysis_config.yaml |  9 +++++++++
 4 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index b43434d..2c42116 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,6 +1,16 @@
 Changelog
 =========
 
+Version 0.10.0
+--------------
+
+New Features
+~~~~~~~~~~~~
+
+- Add ``multi_index`` option to the features configuration, to decide whether ``reset_index()`` should be applied to the features DataFrames.
+  Note: the features cache will be rebuilt, although the resulting DataFrames are unchanged (because the default value of the new option is ``True``).
+
+
 Version 0.9.1
 -------------
 
diff --git a/src/blueetl/config/analysis_model.py b/src/blueetl/config/analysis_model.py
index dcf30a8..c2ccfb1 100644
--- a/src/blueetl/config/analysis_model.py
+++ b/src/blueetl/config/analysis_model.py
@@ -176,6 +176,7 @@ class FeaturesConfig(BaseModel):
     params_product: dict[str, Any] = {}
     params_zip: dict[str, Any] = {}
     suffix: str = ""
+    multi_index: bool = True
 
 
 class SingleAnalysisConfig(BaseModel):
diff --git a/src/blueetl/features.py b/src/blueetl/features.py
index 9fbf0e6..d81311b 100644
--- a/src/blueetl/features.py
+++ b/src/blueetl/features.py
@@ -431,7 +431,11 @@ def _user_func_wrapper(
         # ignore the index if it's unnamed and with one level; this can be useful
         # for example when the returned DataFrame has a RangeIndex to be dropped
         drop = result_df.index.names == [None]
-        result_df = result_df.etl.add_conditions(conditions=key._fields, values=key, drop=drop)
+        if features_config.multi_index:
+            result_df = result_df.etl.add_conditions(conditions=key._fields, values=key, drop=drop)
+        else:
+            result_df.reset_index(drop=drop, inplace=True)
+            result_df.etl.insert_columns(loc=0, columns=key._fields, values=key)
         # the conversion to the desired dtype here is important to reduce memory usage and cpu time
         result_df = ensure_dtypes(result_df)
         output_dir = temp_folder / f"{feature_group}{features_config.suffix}"
diff --git a/src/blueetl/schemas/analysis_config.yaml b/src/blueetl/schemas/analysis_config.yaml
index f5bdaba..c57cf9b 100644
--- a/src/blueetl/schemas/analysis_config.yaml
+++ b/src/blueetl/schemas/analysis_config.yaml
@@ -483,6 +483,15 @@ $defs:
           A numeric suffix is automatically added when any of ``params_product`` or ``params_zip`` is specified.
         default: "''"
         type: string
+      multi_index:
+        title: MultiIndex
+        description: |
+          - If True, do not reset the index of the resulting DataFrames of features, and add the values specified in ``groupby`` to the MultiIndex.
+          - If False, reset the index, returning columnar DataFrames.
+
+          The DataFrames with MultiIndex should use less memory than the columnar DataFrames, but they take more time to load and dump to disk.
+        type: boolean
+        default: "true"
     required:
     - type
     - groupby