From 2a65660d6cf9613482004562bba43ea02c317097 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:41:35 +0100 Subject: [PATCH 01/24] adapt embedding layer to new input format of tuple information --- .../arch_utils/layer_utils/embedding_layer.py | 57 ++++++++++++------- 1 file changed, 37 insertions(+), 20 deletions(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 0fb93fd..0184ca3 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -6,7 +6,7 @@ class EmbeddingLayer(nn.Module): - def __init__(self, num_feature_info, cat_feature_info, config): + def __init__(self, num_feature_info, cat_feature_info, emb_feature_info, config): """Embedding layer that handles numerical and categorical embeddings. Parameters @@ -28,6 +28,7 @@ def __init__(self, num_feature_info, cat_feature_info, config): self.layer_norm_after_embedding = getattr( config, "layer_norm_after_embedding", False ) + self.embedding_projection = getattr(config, "embedding_projection", True) self.use_cls = getattr(config, "use_cls", False) self.cls_position = getattr(config, "cls_position", 0) self.embedding_dropout = ( @@ -100,6 +101,21 @@ def __init__(self, num_feature_info, cat_feature_info, config): ] ) + if self.embedding_projection: + self.emb_embeddings = nn.ModuleList( + [ + nn.Sequential( + nn.Linear( + feature_info["dimension"], + self.d_model, + bias=self.embedding_bias, + ), + self.embedding_activation, + ) + for feature_name, feature_info in emb_feature_info.items() + ] + ) + # Class token if required if self.use_cls: self.cls_token = nn.Parameter(torch.zeros(1, 1, self.d_model)) @@ -108,15 +124,12 @@ def __init__(self, num_feature_info, cat_feature_info, config): if self.layer_norm_after_embedding: self.embedding_norm = nn.LayerNorm(self.d_model) - def forward(self, num_features=None, cat_features=None): + def forward(self, num_features, cat_features, emb_features): """Defines the forward pass of the model. Parameters ---------- - num_features : Tensor, optional - Tensor containing the numerical features. - cat_features : Tensor, optional - Tensor containing the categorical features. + data: tuple of lists of tensors Returns ------- @@ -128,6 +141,7 @@ def forward(self, num_features=None, cat_features=None): ValueError If no features are provided to the model. 
""" + num_embeddings, cat_embeddings, emb_embeddings = None, None, None # Class token initialization if self.use_cls: @@ -147,8 +161,6 @@ def forward(self, num_features=None, cat_features=None): cat_embeddings = torch.squeeze(cat_embeddings, dim=2) if self.layer_norm_after_embedding: cat_embeddings = self.embedding_norm(cat_embeddings) - else: - cat_embeddings = None # Process numerical embeddings based on embedding_type if self.embedding_type == "plr": @@ -161,8 +173,6 @@ def forward(self, num_features=None, cat_features=None): num_embeddings = self.num_embeddings(num_features) if self.layer_norm_after_embedding: num_embeddings = self.embedding_norm(num_embeddings) - else: - num_embeddings = None else: # For linear and ndt embeddings, handle each feature individually if self.num_embeddings and num_features is not None: @@ -170,16 +180,23 @@ def forward(self, num_features=None, cat_features=None): num_embeddings = torch.stack(num_embeddings, dim=1) if self.layer_norm_after_embedding: num_embeddings = self.embedding_norm(num_embeddings) - else: - num_embeddings = None - - # Combine categorical and numerical embeddings - if cat_embeddings is not None and num_embeddings is not None: - x = torch.cat([cat_embeddings, num_embeddings], dim=1) - elif cat_embeddings is not None: - x = cat_embeddings - elif num_embeddings is not None: - x = num_embeddings + + if self.embedding_projection: + emb_embeddings = [ + emb(emb_features[i]) for i, emb in enumerate(self.emb_embeddings) + ] + emb_embeddings = torch.stack(emb_embeddings, dim=1) + else: + emb_embeddings = torch.stack(emb_features, dim=1) + if self.layer_norm_after_embedding: + emb_embeddings = self.embedding_norm(emb_embeddings) + + embeddings = [ + e for e in [cat_embeddings, num_embeddings, emb_embeddings] if e is not None + ] + + if embeddings: + x = torch.cat(embeddings, dim=1) if len(embeddings) > 1 else embeddings[0] else: raise ValueError("No features provided to the model.") From 4d5f94a9ce5cf568029bbce8607ad74ad5a55269 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:41:53 +0100 Subject: [PATCH 02/24] adapt basemodel encoding function to tuple input --- mambular/base_models/basemodel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mambular/base_models/basemodel.py b/mambular/base_models/basemodel.py index fd21852..49b56cd 100644 --- a/mambular/base_models/basemodel.py +++ b/mambular/base_models/basemodel.py @@ -223,7 +223,7 @@ def pool_sequence(self, out): else: raise ValueError(f"Invalid pooling method: {self.hparams.pooling_method}") - def encode(self, num_features, cat_features): + def encode(self, data): if not hasattr(self, "embedding_layer"): raise ValueError("The model does not have an embedding layer") @@ -237,7 +237,7 @@ def encode(self, num_features, cat_features): raise ValueError("The model does not generate contextualized embeddings") # Get the actual layer and call it - x = self.embedding_layer(num_features=num_features, cat_features=cat_features) + x = self.embedding_layer(*data) if getattr(self.hparams, "shuffle_embeddings", False): x = x[:, self.perm, :] From adc6d191c04a9f025aa36fc7fb4a3060a53bd7dd Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:42:08 +0100 Subject: [PATCH 03/24] batch now returns tuple and *data is passed to forward method --- mambular/base_models/lightning_wrapper.py | 45 +++++++++++++---------- 1 file changed, 26 insertions(+), 19 deletions(-) diff --git a/mambular/base_models/lightning_wrapper.py b/mambular/base_models/lightning_wrapper.py 
index 1d8530e..f1c836c 100644 --- a/mambular/base_models/lightning_wrapper.py +++ b/mambular/base_models/lightning_wrapper.py @@ -30,8 +30,7 @@ def __init__( self, model_class: type[nn.Module], config, - cat_feature_info, - num_feature_info, + feature_information, num_classes=1, lss=False, family=None, @@ -91,13 +90,12 @@ def __init__( self.base_model = model_class( config=config, - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + feature_information=feature_information, num_classes=output_dim, **kwargs, ) - def forward(self, num_features, cat_features): + def forward(self, num_features, cat_features, embeddings): """Forward pass through the model. Parameters @@ -113,7 +111,7 @@ def forward(self, num_features, cat_features): Model output. """ - return self.base_model.forward(num_features, cat_features) + return self.base_model.forward(num_features, cat_features, embeddings) def compute_loss(self, predictions, y_true): """Compute the loss for the given predictions and true labels. @@ -145,7 +143,10 @@ def compute_loss(self, predictions, y_true): ) if getattr(self.base_model, "returns_ensemble", False): # Ensemble case - if self.loss_fct.__class__.__name__ == "CrossEntropyLoss" and predictions.dim() == 3: + if ( + self.loss_fct.__class__.__name__ == "CrossEntropyLoss" + and predictions.dim() == 3 + ): # Classification case with ensemble: predictions (N, E, k), y_true (N,) N, E, k = predictions.shape loss = 0.0 @@ -186,18 +187,20 @@ def training_step(self, batch, batch_idx): # type: ignore Tensor Training loss. """ - num_features, cat_features, labels = batch + data, labels = batch # Check if the model has a `penalty_forward` method if hasattr(self.base_model, "penalty_forward"): - preds, penalty = self.base_model.penalty_forward(num_features=num_features, cat_features=cat_features) + preds, penalty = self.base_model.penalty_forward(*data) loss = self.compute_loss(preds, labels) + penalty else: - preds = self(num_features=num_features, cat_features=cat_features) + preds = self(*data) loss = self.compute_loss(preds, labels) # Log the training loss - self.log("train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True) + self.log( + "train_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True + ) # Log custom training metrics for metric_name, metric_fn in self.train_metrics.items(): @@ -229,8 +232,8 @@ def validation_step(self, batch, batch_idx): # type: ignore Validation loss. """ - num_features, cat_features, labels = batch - preds = self(num_features=num_features, cat_features=cat_features) + data, labels = batch + preds = self(*data) val_loss = self.compute_loss(preds, labels) self.log( @@ -271,8 +274,8 @@ def test_step(self, batch, batch_idx): # type: ignore Tensor Test loss. """ - num_features, cat_features, labels = batch - preds = self(num_features=num_features, cat_features=cat_features) + data, labels = batch + preds = self(*data) test_loss = self.compute_loss(preds, labels) self.log( @@ -302,8 +305,7 @@ def predict_step(self, batch, batch_idx): Predictions. 
""" - num_features, cat_features = batch - preds = self(num_features=num_features, cat_features=cat_features) + preds = self(*batch) return preds @@ -346,8 +348,13 @@ def on_validation_epoch_end(self): # Apply pruning logic if needed if self.current_epoch >= self.pruning_epoch: - if self.early_pruning_threshold is not None and val_loss_value > self.early_pruning_threshold: - print(f"Pruned at epoch {self.current_epoch}, val_loss {val_loss_value}") + if ( + self.early_pruning_threshold is not None + and val_loss_value > self.early_pruning_threshold + ): + print( + f"Pruned at epoch {self.current_epoch}, val_loss {val_loss_value}" + ) self.trainer.should_stop = True # Stop training early def epoch_val_loss_at(self, epoch): From a02b9dd7b11e590107a0cba8e031c04969e9a409 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:42:18 +0100 Subject: [PATCH 04/24] first two basemodels adapted to new logic --- mambular/base_models/mlp.py | 33 ++++++++++++++++--------------- mambular/base_models/tabularnn.py | 15 ++++++-------- 2 files changed, 23 insertions(+), 25 deletions(-) diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index 0c9251f..1a38871 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -5,6 +5,7 @@ from ..configs.mlp_config import DefaultMLPConfig from ..utils.get_feature_dimensions import get_feature_dimensions from .basemodel import BaseModel +import numpy as np class MLP(BaseModel): @@ -57,31 +58,29 @@ class MLP(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (cat_feature_info, num_feature_info, embedding_feature_info) num_classes: int = 1, config: DefaultMLPConfig = DefaultMLPConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) self.returns_ensemble = False - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info # Initialize layers self.layers = nn.ModuleList() - input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) + input_dim = get_feature_dimensions(*feature_information) if self.hparams.use_embeddings: self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) - input_dim = len(num_feature_info) * self.hparams.d_model + len(cat_feature_info) * self.hparams.d_model + input_dim = np.sum( + [len(info) * self.hparams.d_model for info in feature_information] + ) # Input layer self.layers.append(nn.Linear(input_dim, self.hparams.layer_sizes[0])) @@ -97,7 +96,9 @@ def __init__( # Hidden layers for i in range(1, len(self.hparams.layer_sizes)): - self.layers.append(nn.Linear(self.hparams.layer_sizes[i - 1], self.hparams.layer_sizes[i])) + self.layers.append( + nn.Linear(self.hparams.layer_sizes[i - 1], self.hparams.layer_sizes[i]) + ) if self.hparams.batch_norm: self.layers.append(nn.BatchNorm1d(self.hparams.layer_sizes[i])) if self.hparams.layer_norm: @@ -112,26 +113,26 @@ def __init__( # Output layer self.layers.append(nn.Linear(self.hparams.layer_sizes[-1], num_classes)) - def forward(self, num_features, cat_features) -> torch.Tensor: + def forward(self, *data) -> torch.Tensor: """Forward pass of the MLP model. Parameters ---------- - x : torch.Tensor - Input tensor. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. 
Returns ------- torch.Tensor Output tensor. """ + if self.hparams.use_embeddings: - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) B, S, D = x.shape x = x.reshape(B, S * D) else: - x = num_features + cat_features - x = torch.cat(x, dim=1) + x = torch.cat([t for tensors in data for t in tensors], dim=1) for i in range(len(self.layers) - 1): if isinstance(self.layers[i], nn.Linear): diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index d4824e9..5699bf7 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -12,10 +12,10 @@ class TabulaRNN(BaseModel): + def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (cat_feature_info, num_feature_info, embedding_feature_info) num_classes=1, config: DefaultTabulaRNNConfig = DefaultTabulaRNNConfig(), # noqa: B008 **kwargs, @@ -24,14 +24,11 @@ def __init__( self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) self.returns_ensemble = False - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info self.rnn = ConvRNN(config) self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) @@ -50,10 +47,10 @@ def __init__( self.norm_f = get_normalization_layer(temp_config) # pooling - n_inputs = len(num_feature_info) + len(cat_feature_info) + n_inputs = [len(info) for info in feature_information] self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - def forward(self, num_features, cat_features): + def forward(self, *data): """Defines the forward pass of the model. Parameters @@ -69,7 +66,7 @@ def forward(self, num_features, cat_features): The output predictions of the model. """ - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) # RNN forward pass out, _ = self.rnn(x) z = self.linear(torch.mean(x, dim=1)) From 10d1c00487474c023ed7e308dfc4d02e4282b68f Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:42:41 +0100 Subject: [PATCH 05/24] major changes in handling embeddings as array/list inputs in addition to tabular data --- mambular/data_utils/datamodule.py | 123 +++++++++++++++++++++++++----- 1 file changed, 103 insertions(+), 20 deletions(-) diff --git a/mambular/data_utils/datamodule.py b/mambular/data_utils/datamodule.py index b6bfb32..459b8c1 100644 --- a/mambular/data_utils/datamodule.py +++ b/mambular/data_utils/datamodule.py @@ -78,6 +78,8 @@ def __init__( # Initialize placeholders for data self.X_train = None self.y_train = None + self.embeddings_train = None + self.embeddings_val = None self.test_preprocessor_fitted = False self.dataloader_kwargs = dataloader_kwargs @@ -87,6 +89,8 @@ def preprocess_data( y_train, X_val=None, y_val=None, + embeddings_train=None, + embeddings_val=None, val_size=0.2, random_state=101, ): @@ -98,10 +102,14 @@ def preprocess_data( Training feature set. y_train : array-like, shape (n_samples_train,) Training target values. + embeddings_train : array-like or list of array-like, optional + Training embeddings if available. X_val : DataFrame or array-like, shape (n_samples_val, n_features), optional Validation feature set. If None, a validation set will be created from `X_train`. y_val : array-like, shape (n_samples_val,), optional Validation target values. If None, a validation set will be created from `y_train`. 
+ embeddings_val : array-like or list of array-like, optional + Validation embeddings if available. val_size : float, optional Proportion of data to include in the validation split if `X_val` and `y_val` are None. random_state : int, optional @@ -113,41 +121,85 @@ def preprocess_data( """ if X_val is None or y_val is None: - self.X_train, self.X_val, self.y_train, self.y_val = train_test_split( - X_train, y_train, test_size=val_size, random_state=random_state - ) + split_data = [X_train, y_train] + + if embeddings_train is not None: + if not isinstance(embeddings_train, list): + embeddings_train = [embeddings_train] + if embeddings_val is not None and not isinstance(embeddings_val, list): + embeddings_val = [embeddings_val] + + split_data += embeddings_train + split_result = train_test_split( + *split_data, test_size=val_size, random_state=random_state + ) + + self.X_train, self.X_val, self.y_train, self.y_val = split_result[:4] + self.embeddings_train = split_result[4::2] + self.embeddings_val = split_result[5::2] + else: + self.X_train, self.X_val, self.y_train, self.y_val = train_test_split( + *split_data, test_size=val_size, random_state=random_state + ) + self.embeddings_train = None + self.embeddings_val = None else: self.X_train = X_train self.y_train = y_train self.X_val = X_val self.y_val = y_val + if embeddings_train is not None and embeddings_val is not None: + if not isinstance(embeddings_train, list): + embeddings_train = [embeddings_train] + if not isinstance(embeddings_val, list): + embeddings_val = [embeddings_val] + self.embeddings_train = embeddings_train + self.embeddings_val = embeddings_val + else: + self.embeddings_train = None + self.embeddings_val = None + # Fit the preprocessor on the combined training and validation data combined_X = pd.concat([self.X_train, self.X_val], axis=0).reset_index( drop=True ) combined_y = np.concatenate((self.y_train, self.y_val), axis=0) - # Fit the preprocessor - self.preprocessor.fit(combined_X, combined_y) + if self.embeddings_train is not None and self.embeddings_val is not None: + combined_embeddings = [ + np.concatenate((emb_train, emb_val), axis=0) + for emb_train, emb_val in zip( + self.embeddings_train, self.embeddings_val + ) + ] + else: + combined_embeddings = None + + self.preprocessor.fit(combined_X, combined_y, combined_embeddings) # Update feature info based on the actual processed data - ( - self.num_feature_info, - self.cat_feature_info, - ) = self.preprocessor.get_feature_info() + (self.num_feature_info, self.cat_feature_info, self.embedding_feature_info) = ( + self.preprocessor.get_feature_info() + ) def setup(self, stage: str): """Transform the data and create DataLoaders.""" if stage == "fit": - train_preprocessed_data = self.preprocessor.transform(self.X_train) - val_preprocessed_data = self.preprocessor.transform(self.X_val) + train_preprocessed_data = self.preprocessor.transform( + self.X_train, self.embeddings_train + ) + val_preprocessed_data = self.preprocessor.transform( + self.X_val, self.embeddings_val + ) # Initialize lists for tensors train_cat_tensors = [] train_num_tensors = [] + train_emb_tensors = [] val_cat_tensors = [] val_num_tensors = [] + val_emb_tensors = [] # Populate tensors for categorical features, if present in processed data for key in self.cat_feature_info: # type: ignore @@ -201,6 +253,21 @@ def setup(self, stage: str): ) ) + if self.embedding_feature_info is not None: + for key in self.embedding_feature_info: + if key in train_preprocessed_data: + train_emb_tensors.append( + 
torch.tensor( + train_preprocessed_data[key], dtype=torch.float32 + ) + ) + if key in val_preprocessed_data: + val_emb_tensors.append( + torch.tensor( + val_preprocessed_data[key], dtype=torch.float32 + ) + ) + train_labels = torch.tensor( self.y_train, dtype=self.labels_dtype ).unsqueeze(dim=1) @@ -208,21 +275,26 @@ def setup(self, stage: str): dim=1 ) - # Create datasets self.train_dataset = MambularDataset( train_cat_tensors, train_num_tensors, + train_emb_tensors, train_labels, regression=self.regression, ) self.val_dataset = MambularDataset( - val_cat_tensors, val_num_tensors, val_labels, regression=self.regression + val_cat_tensors, + val_num_tensors, + val_emb_tensors, + val_labels, + regression=self.regression, ) - def preprocess_new_data(self, X): + def preprocess_new_data(self, X, embeddings): cat_tensors = [] num_tensors = [] - preprocessed_data = self.preprocessor.transform(X) + emb_tensors = [] + preprocessed_data = self.preprocessor.transform(X, embeddings) # Populate tensors for categorical features, if present in processed data for key in self.cat_feature_info: # type: ignore @@ -254,15 +326,26 @@ def preprocess_new_data(self, X): torch.tensor(preprocessed_data[num_key], dtype=torch.float32) ) + if self.embedding_feature_info is not None: + for key in self.embedding_feature_info: + if key in preprocessed_data: + emb_tensors.append( + torch.tensor(preprocessed_data[key], dtype=torch.float32) + ) + return MambularDataset( - cat_tensors, num_tensors, labels=None, regression=self.regression + cat_tensors, + num_tensors, + emb_tensors, + labels=None, + regression=self.regression, ) - def assign_predict_dataset(self, X): - self.predict_dataset = self.preprocess_new_data(X) + def assign_predict_dataset(self, X, embeddings=None): + self.predict_dataset = self.preprocess_new_data(X, embeddings) - def assign_test_dataset(self, X): - self.test_dataset = self.preprocess_new_data(X) + def assign_test_dataset(self, X, embeddings=None): + self.test_dataset = self.preprocess_new_data(X, embeddings) def train_dataloader(self): """Returns the training dataloader. From cbe8dd36bb63554d1eba9e9cd7d8416abdddfeba Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:43:00 +0100 Subject: [PATCH 06/24] dataset returns tuple of data (cat, num, emb), label --- mambular/data_utils/dataset.py | 32 ++++++++++++++++++++++++++------ 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/mambular/data_utils/dataset.py b/mambular/data_utils/dataset.py index 6bb0485..db6c63a 100644 --- a/mambular/data_utils/dataset.py +++ b/mambular/data_utils/dataset.py @@ -11,13 +11,22 @@ class MambularDataset(Dataset): ---------- cat_features_list (list of Tensors): A list of tensors representing the categorical features. num_features_list (list of Tensors): A list of tensors representing the numerical features. + embeddings_list (list of Tensors, optional): A list of tensors representing the embeddings. labels (Tensor, optional): A tensor of labels. If None, the dataset is used for prediction. regression (bool, optional): A flag indicating if the dataset is for a regression task. Defaults to True. 
""" - def __init__(self, cat_features_list, num_features_list, labels=None, regression=True): + def __init__( + self, + cat_features_list, + num_features_list, + embeddings_list=None, + labels=None, + regression=True, + ): self.cat_features_list = cat_features_list # Categorical features tensors self.num_features_list = num_features_list # Numerical features tensors + self.embeddings_list = embeddings_list # Embeddings tensors (optional) self.regression = regression if labels is not None: @@ -46,15 +55,25 @@ def __getitem__(self, idx): Returns ------- - tuple: A tuple containing two lists of tensors (one for categorical features and one for numerical features) - and a single label (if available). + tuple: A tuple containing lists of tensors for numerical features, categorical features, embeddings + (if available), and a label (if available). """ - cat_features = [feature_tensor[idx] for feature_tensor in self.cat_features_list] + cat_features = [ + feature_tensor[idx] for feature_tensor in self.cat_features_list + ] num_features = [ torch.as_tensor(feature_tensor[idx]).clone().detach().to(torch.float32) for feature_tensor in self.num_features_list ] + if self.embeddings_list is not None: + embeddings = [ + torch.as_tensor(embed_tensor[idx]).clone().detach().to(torch.float32) + for embed_tensor in self.embeddings_list + ] + else: + embeddings = None + if self.labels is not None: label = self.labels[idx] if self.regression: @@ -63,6 +82,7 @@ def __getitem__(self, idx): label = label.clone().detach().to(torch.float32) else: label = label.clone().detach().to(torch.long) - return num_features, cat_features, label + + return (num_features, cat_features, embeddings), label else: - return num_features, cat_features # No label in prediction mode + return (num_features, cat_features, embeddings) From b84aa50d1a93c0d709805556126bd07cf4c192ee Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:43:15 +0100 Subject: [PATCH 07/24] adjust two first basemodel configs to handle projection for embeddings --- mambular/configs/mlp_config.py | 1 + mambular/configs/tabularnn_config.py | 1 + 2 files changed, 2 insertions(+) diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index 0c43cc1..08711be 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -83,3 +83,4 @@ class DefaultMLPConfig: plr_lite: bool = False n_frequencies: int = 48 frequencies_init_scale: float = 0.01 + embedding_projection: bool = True diff --git a/mambular/configs/tabularnn_config.py b/mambular/configs/tabularnn_config.py index 037c96d..f945fbe 100644 --- a/mambular/configs/tabularnn_config.py +++ b/mambular/configs/tabularnn_config.py @@ -97,6 +97,7 @@ class DefaultTabulaRNNConfig: frequencies_init_scale: float = 0.01 embedding_activation: Callable = nn.ReLU() # noqa: RUF009 layer_norm_after_embedding: bool = False + embedding_projection: bool = True # Head params head_layer_sizes: list = field(default_factory=list) From 8cc3e8368d0b30bfb0aa5c426dce66e6bab4b1ad Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:43:29 +0100 Subject: [PATCH 08/24] adapt first only regressor and classifier to handle embeddings --- mambular/models/sklearn_base_classifier.py | 161 ++++++++++++++++----- mambular/models/sklearn_base_regressor.py | 151 ++++++++++++++----- 2 files changed, 235 insertions(+), 77 deletions(-) diff --git a/mambular/models/sklearn_base_classifier.py b/mambular/models/sklearn_base_classifier.py index 6317e62..1149e14 100644 --- 
a/mambular/models/sklearn_base_classifier.py +++ b/mambular/models/sklearn_base_classifier.py @@ -8,7 +8,7 @@ import torch from lightning.pytorch.callbacks import EarlyStopping, ModelCheckpoint, ModelSummary from sklearn.base import BaseEstimator -from sklearn.metrics import accuracy_score, log_loss, mean_squared_error +from sklearn.metrics import accuracy_score, log_loss from skopt import gp_minimize from torch.utils.data import DataLoader from tqdm import tqdm @@ -16,7 +16,11 @@ from ..base_models.lightning_wrapper import TaskModel from ..data_utils.datamodule import MambularDataModule from ..preprocessing import Preprocessor -from ..utils.config_mapper import activation_mapper, get_search_space, round_to_nearest_16 +from ..utils.config_mapper import ( + activation_mapper, + get_search_space, + round_to_nearest_16, +) class SklearnBaseClassifier(BaseEstimator): @@ -39,11 +43,15 @@ def __init__(self, model, config, **kwargs): ] self.config_kwargs = { - k: v for k, v in kwargs.items() if k not in self.preprocessor_arg_names and not k.startswith("optimizer") + k: v + for k, v in kwargs.items() + if k not in self.preprocessor_arg_names and not k.startswith("optimizer") } self.config = config(**self.config_kwargs) - preprocessor_kwargs = {k: v for k, v in kwargs.items() if k in self.preprocessor_arg_names} + preprocessor_kwargs = { + k: v for k, v in kwargs.items() if k in self.preprocessor_arg_names + } self.preprocessor = Preprocessor(**preprocessor_kwargs) self.task_model = None @@ -63,7 +71,8 @@ def __init__(self, model, config, **kwargs): self.optimizer_kwargs = { k: v for k, v in kwargs.items() - if k not in ["lr", "weight_decay", "patience", "lr_patience", "optimizer_type"] + if k + not in ["lr", "weight_decay", "patience", "lr_patience", "optimizer_type"] and k.startswith("optimizer_") } @@ -84,7 +93,10 @@ def get_params(self, deep=True): params.update(self.config_kwargs) if deep: - preprocessor_params = {"prepro__" + key: value for key, value in self.preprocessor.get_params().items()} + preprocessor_params = { + "prepro__" + key: value + for key, value in self.preprocessor.get_params().items() + } params.update(preprocessor_params) return params @@ -102,8 +114,14 @@ def set_params(self, **parameters): self : object Estimator instance. 
""" - config_params = {k: v for k, v in parameters.items() if not k.startswith("prepro__")} - preprocessor_params = {k.split("__")[1]: v for k, v in parameters.items() if k.startswith("prepro__")} + config_params = { + k: v for k, v in parameters.items() if not k.startswith("prepro__") + } + preprocessor_params = { + k.split("__")[1]: v + for k, v in parameters.items() + if k.startswith("prepro__") + } if config_params: self.config_kwargs.update(config_params) @@ -125,6 +143,8 @@ def build_model( val_size: float = 0.2, X_val=None, y_val=None, + embeddings=None, + embeddings_val=None, random_state: int = 101, batch_size: int = 128, shuffle: bool = True, @@ -201,7 +221,16 @@ def build_model( **dataloader_kwargs, ) - self.data_module.preprocess_data(X, y, X_val, y_val, val_size=val_size, random_state=random_state) + self.data_module.preprocess_data( + X, + y, + X_val=X_val, + y_val=y_val, + embeddings_train=embeddings, + embeddings_val=embeddings_val, + val_size=val_size, + random_state=random_state, + ) num_classes = len(np.unique(np.array(y))) @@ -209,12 +238,19 @@ def build_model( model_class=self.base_model, # type: ignore num_classes=num_classes, config=self.config, - cat_feature_info=self.data_module.cat_feature_info, - num_feature_info=self.data_module.num_feature_info, - lr_patience=(lr_patience if lr_patience is not None else self.config.lr_patience), + feature_information=( + self.data_module.num_feature_info, + self.data_module.cat_feature_info, + self.data_module.embedding_feature_info, + ), + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr=lr if lr is not None else self.config.lr, lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=(weight_decay if weight_decay is not None else self.config.weight_decay), + weight_decay=( + weight_decay if weight_decay is not None else self.config.weight_decay + ), train_metrics=train_metrics, val_metrics=val_metrics, optimizer_type=self.optimizer_type, @@ -245,7 +281,9 @@ def get_number_of_params(self, requires_grad=True): If the model has not been built prior to calling this method. """ if not self.built: - raise ValueError("The model must be built before the number of parameters can be estimated") + raise ValueError( + "The model must be built before the number of parameters can be estimated" + ) else: if requires_grad: return sum(p.numel() for p in self.task_model.parameters() if p.requires_grad) # type: ignore @@ -259,6 +297,8 @@ def fit( val_size: float = 0.2, X_val=None, y_val=None, + embeddings=None, + embeddings_val=None, max_epochs: int = 100, random_state: int = 101, batch_size: int = 128, @@ -340,6 +380,8 @@ def fit( val_size=val_size, X_val=X_val, y_val=y_val, + embeddings=embeddings, + embeddings_val=embeddings_val, random_state=random_state, batch_size=batch_size, shuffle=shuffle, @@ -390,7 +432,7 @@ def fit( return self - def predict(self, X, device=None): + def predict(self, X, embeddings=None, device=None): """Predicts target labels for the given input samples. 
Parameters @@ -408,7 +450,7 @@ def predict(self, X, device=None): raise ValueError("The model or data module has not been fitted yet.") # Preprocess the data using the data module - self.data_module.assign_predict_dataset(X) + self.data_module.assign_predict_dataset(X, embeddings) # Set model to evaluation mode self.task_model.eval() @@ -438,7 +480,7 @@ def predict(self, X, device=None): # Convert predictions to NumPy array and return return predictions.cpu().numpy() - def predict_proba(self, X, device=None): + def predict_proba(self, X, embeddings=None, device=None): """Predicts class probabilities for the given input samples. Parameters @@ -482,7 +524,7 @@ def predict_proba(self, X, device=None): # Convert probabilities to NumPy array and return return probabilities.cpu().numpy() - def evaluate(self, X, y_true, metrics=None): + def evaluate(self, X, y_true, embeddings=None, metrics=None): """Evaluate the model on the given data using specified metrics. Parameters @@ -491,6 +533,8 @@ def evaluate(self, X, y_true, metrics=None): The input samples to predict. y_true : array-like of shape (n_samples,) The true class labels against which to evaluate the predictions. + embneddings : array-like or list of shape(n_samples, dimension) + List or array with embeddings for unstructured data inputs metrics : dict A dictionary where keys are metric names and values are tuples containing the metric function and a boolean indicating whether the metric requires probability scores (True) or class labels (False). @@ -518,11 +562,11 @@ def evaluate(self, X, y_true, metrics=None): # Generate class probabilities if any metric requires them if any(use_proba for _, use_proba in metrics.values()): - probabilities = self.predict_proba(X) + probabilities = self.predict_proba(X, embeddings) # Generate class labels if any metric requires them if any(not use_proba for _, use_proba in metrics.values()): - predictions = self.predict(X) + predictions = self.predict(X, embeddings) # Compute each metric for metric_name, (metric_func, use_proba) in metrics.items(): @@ -533,7 +577,7 @@ def evaluate(self, X, y_true, metrics=None): return scores - def score(self, X, y, metric=(log_loss, True)): + def score(self, X, y, embeddings=None, metric=(log_loss, True)): """Calculate the score of the model using the specified metric. Parameters @@ -557,13 +601,13 @@ def score(self, X, y, metric=(log_loss, True)): X = pd.DataFrame(X) if use_proba: - probabilities = self.predict_proba(X) + probabilities = self.predict_proba(X, embeddings) return metric_func(y, probabilities) else: - predictions = self.predict(X) + predictions = self.predict(X, embeddings) return metric_func(y, predictions) - def encode(self, X, batch_size=64): + def encode(self, X, embeddings=None, batch_size=64): """ Encodes input data using the trained model's embedding layer. 
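The classifier changes above make precomputed embeddings an optional, explicitly named argument of the sklearn-style API (`fit`, `predict`, `predict_proba`, `evaluate`, `score`, `encode`). A minimal usage sketch with toy data; `MambularClassifier` is assumed here as one of the concrete estimators built on `SklearnBaseClassifier`, and the random array stands in for, e.g., precomputed text embeddings:

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from mambular.models import MambularClassifier  # assumed concrete subclass of SklearnBaseClassifier

# toy tabular data plus one precomputed 384-dim embedding per row
X = pd.DataFrame({"age": np.random.rand(256), "city": np.random.choice(["a", "b", "c"], 256)})
y = np.random.randint(0, 2, size=256)
text_emb = np.random.rand(256, 384).astype(np.float32)

clf = MambularClassifier()
# a single array or a list of arrays is accepted; without an explicit validation
# set it is split alongside X and y
clf.fit(X, y, embeddings=text_emb, max_epochs=5)
proba = clf.predict_proba(X, embeddings=text_emb)
scores = clf.evaluate(X, y, embeddings=text_emb, metrics={"Accuracy": (accuracy_score, False)})

When `X_val` and `y_val` are supplied explicitly, a matching `embeddings_val` must be passed as well; otherwise the embeddings are silently dropped for that fit.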
@@ -587,14 +631,16 @@ def encode(self, X, batch_size=64): # Ensure model and data module are initialized if self.task_model is None or self.data_module is None: raise ValueError("The model or data module has not been fitted yet.") - encoded_dataset = self.data_module.preprocess_new_data(X) + encoded_dataset = self.data_module.preprocess_new_data(X, embeddings) data_loader = DataLoader(encoded_dataset, batch_size=batch_size, shuffle=False) # Process data in batches encoded_outputs = [] - for num_features, cat_features in tqdm(data_loader): - embeddings = self.task_model.base_model.encode(num_features, cat_features) # Call your encode function + for batch in tqdm(data_loader): + embeddings = self.task_model.base_model.encode( + batch + ) # Call your encode function encoded_outputs.append(embeddings) # Concatenate all encoded outputs @@ -608,6 +654,8 @@ def optimize_hparams( y, X_val=None, y_val=None, + embeddings=None, + embeddings_val=None, time=100, max_epochs=200, prune_by_epoch=True, @@ -658,13 +706,25 @@ def optimize_hparams( ) # Initial model fitting to get the baseline validation loss - self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) + self.fit( + X, + y, + X_val=X_val, + y_val=y_val, + embeddings=embeddings, + embeddings_val=embeddings_val, + max_epochs=max_epochs, + ) best_val_loss = float("inf") if X_val is not None and y_val is not None: - val_loss = self.evaluate(X_val, y_val, metrics={"Accuracy": (accuracy_score, False)})["Accuracy"] + val_loss = self.evaluate( + X_val, y_val, metrics={"Accuracy": (accuracy_score, False)} + )["Accuracy"] else: - val_loss = self.trainer.validate(self.task_model, self.data_module)[0]["val_loss"] + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] best_val_loss = val_loss best_epoch_val_loss = self.task_model.epoch_val_loss_at( # type: ignore @@ -690,7 +750,9 @@ def _objective(hyperparams): if param_value in activation_mapper: setattr(self.config, key, activation_mapper[param_value]) else: - raise ValueError(f"Unknown activation function: {param_value}") + raise ValueError( + f"Unknown activation function: {param_value}" + ) else: setattr(self.config, key, param_value) @@ -699,11 +761,15 @@ def _objective(hyperparams): self.config.head_layer_sizes = head_layer_sizes[:head_layer_size_length] # Build the model with updated hyperparameters - self.build_model(X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs) + self.build_model( + X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs + ) # Dynamically set the early pruning threshold if prune_by_epoch: - early_pruning_threshold = best_epoch_val_loss * 1.5 # Prune based on specific epoch loss + early_pruning_threshold = ( + best_epoch_val_loss * 1.5 + ) # Prune based on specific epoch loss else: # Prune based on the best overall validation loss early_pruning_threshold = best_val_loss * 1.5 @@ -715,15 +781,26 @@ def _objective(hyperparams): # Fit the model (limit epochs for faster optimization) try: # Wrap the risky operation (model fitting) in a try-except block - self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False) + self.fit( + X, + y, + X_val=X_val, + y_val=y_val, + embeddings=embeddings, + embeddings_val=embeddings_val, + max_epochs=max_epochs, + rebuild=False, + ) # Evaluate validation loss if X_val is not None and y_val is not None: - val_loss = self.evaluate(X_val, y_val, metrics={"Mean Squared Error": mean_squared_error})[ # type: ignore + val_loss = self.evaluate(X_val, y_val, 
metrics={"Accuracy": (accuracy_score, False)})[ # type: ignore "Mean Squared Error" ] else: - val_loss = self.trainer.validate(self.task_model, self.data_module)[0]["val_loss"] + val_loss = self.trainer.validate(self.task_model, self.data_module)[ + 0 + ]["val_loss"] # Pruning based on validation loss at specific epoch epoch_val_loss = self.task_model.epoch_val_loss_at( # type: ignore @@ -740,15 +817,21 @@ def _objective(hyperparams): except Exception as e: # Penalize the hyperparameter configuration with a large value - print(f"Error encountered during fit with hyperparameters {hyperparams}: {e}") - return best_val_loss * 100 # Large value to discourage this configuration + print( + f"Error encountered during fit with hyperparameters {hyperparams}: {e}" + ) + return ( + best_val_loss * 100 + ) # Large value to discourage this configuration # Perform Bayesian optimization using scikit-optimize result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) # Update the model with the best-found hyperparameters best_hparams = result.x # type: ignore - head_layer_sizes = [] if "head_layer_sizes" in self.config.__dataclass_fields__ else None + head_layer_sizes = ( + [] if "head_layer_sizes" in self.config.__dataclass_fields__ else None + ) layer_sizes = [] if "layer_sizes" in self.config.__dataclass_fields__ else None # Iterate over the best hyperparameters found by optimization diff --git a/mambular/models/sklearn_base_regressor.py b/mambular/models/sklearn_base_regressor.py index 04f9ac3..94e9bac 100644 --- a/mambular/models/sklearn_base_regressor.py +++ b/mambular/models/sklearn_base_regressor.py @@ -41,11 +41,15 @@ def __init__(self, model, config, **kwargs): ] self.config_kwargs = { - k: v for k, v in kwargs.items() if k not in self.preprocessor_arg_names and not k.startswith("optimizer") + k: v + for k, v in kwargs.items() + if k not in self.preprocessor_arg_names and not k.startswith("optimizer") } self.config = config(**self.config_kwargs) - preprocessor_kwargs = {k: v for k, v in kwargs.items() if k in self.preprocessor_arg_names} + preprocessor_kwargs = { + k: v for k, v in kwargs.items() if k in self.preprocessor_arg_names + } self.preprocessor = Preprocessor(**preprocessor_kwargs) self.base_model = model @@ -65,7 +69,8 @@ def __init__(self, model, config, **kwargs): self.optimizer_kwargs = { k: v for k, v in kwargs.items() - if k not in ["lr", "weight_decay", "patience", "lr_patience", "optimizer_type"] + if k + not in ["lr", "weight_decay", "patience", "lr_patience", "optimizer_type"] and k.startswith("optimizer_") } @@ -86,7 +91,10 @@ def get_params(self, deep=True): params.update(self.config_kwargs) if deep: - preprocessor_params = {"prepro__" + key: value for key, value in self.preprocessor.get_params().items()} + preprocessor_params = { + "prepro__" + key: value + for key, value in self.preprocessor.get_params().items() + } params.update(preprocessor_params) return params @@ -104,8 +112,14 @@ def set_params(self, **parameters): self : object Estimator instance. 
""" - config_params = {k: v for k, v in parameters.items() if not k.startswith("prepro__")} - preprocessor_params = {k.split("__")[1]: v for k, v in parameters.items() if k.startswith("prepro__")} + config_params = { + k: v for k, v in parameters.items() if not k.startswith("prepro__") + } + preprocessor_params = { + k.split("__")[1]: v + for k, v in parameters.items() + if k.startswith("prepro__") + } if config_params: self.config_kwargs.update(config_params) @@ -127,6 +141,8 @@ def build_model( val_size: float = 0.2, X_val=None, y_val=None, + embeddings=None, + embeddings_val=None, random_state: int = 101, batch_size: int = 128, shuffle: bool = True, @@ -203,17 +219,33 @@ def build_model( **dataloader_kwargs, ) - self.data_module.preprocess_data(X, y, X_val, y_val, val_size=val_size, random_state=random_state) + self.data_module.preprocess_data( + X, + y, + X_val=X_val, + y_val=y_val, + embeddings_train=embeddings, + embeddings_val=embeddings_val, + val_size=val_size, + random_state=random_state, + ) self.task_model = TaskModel( model_class=self.base_model, # type: ignore config=self.config, - cat_feature_info=self.data_module.cat_feature_info, - num_feature_info=self.data_module.num_feature_info, + feature_information=( + self.data_module.num_feature_info, + self.data_module.cat_feature_info, + self.data_module.embedding_feature_info, + ), lr=lr if lr is not None else self.config.lr, - lr_patience=(lr_patience if lr_patience is not None else self.config.lr_patience), + lr_patience=( + lr_patience if lr_patience is not None else self.config.lr_patience + ), lr_factor=lr_factor if lr_factor is not None else self.config.lr_factor, - weight_decay=(weight_decay if weight_decay is not None else self.config.weight_decay), + weight_decay=( + weight_decay if weight_decay is not None else self.config.weight_decay + ), train_metrics=train_metrics, val_metrics=val_metrics, optimizer_type=self.optimizer_type, @@ -244,7 +276,9 @@ def get_number_of_params(self, requires_grad=True): If the model has not been built prior to calling this method. """ if not self.built: - raise ValueError("The model must be built before the number of parameters can be estimated") + raise ValueError( + "The model must be built before the number of parameters can be estimated" + ) else: if requires_grad: return sum(p.numel() for p in self.task_model.parameters() if p.requires_grad) # type: ignore @@ -258,6 +292,8 @@ def fit( val_size: float = 0.2, X_val=None, y_val=None, + embeddings=None, + embeddings_val=None, max_epochs: int = 100, random_state: int = 101, batch_size: int = 128, @@ -339,6 +375,8 @@ def fit( val_size=val_size, X_val=X_val, y_val=y_val, + embeddings=embeddings, + embeddings_val=embeddings_val, random_state=random_state, batch_size=batch_size, shuffle=shuffle, @@ -389,7 +427,7 @@ def fit( return self - def predict(self, X, device=None): + def predict(self, X, embeddings=None, device=None): """Predicts target values for the given input samples. 
Parameters @@ -408,7 +446,7 @@ def predict(self, X, device=None): raise ValueError("The model or data module has not been fitted yet.") # Preprocess the data using the data module - self.data_module.assign_predict_dataset(X) + self.data_module.assign_predict_dataset(X, embeddings) # Set model to evaluation mode self.task_model.eval() @@ -426,7 +464,7 @@ def predict(self, X, device=None): # Convert predictions to NumPy array and return return predictions.cpu().numpy() - def evaluate(self, X, y_true, metrics=None): + def evaluate(self, X, y_true, embeddings=None, metrics=None): """Evaluate the model on the given data using specified metrics. Parameters @@ -452,7 +490,7 @@ def evaluate(self, X, y_true, metrics=None): metrics = {"Mean Squared Error": mean_squared_error} # Generate predictions using the trained model - predictions = self.predict(X) + predictions = self.predict(X, embeddings=embeddings) # Initialize dictionary to store results scores = {} @@ -463,7 +501,7 @@ def evaluate(self, X, y_true, metrics=None): return scores - def score(self, X, y, metric=mean_squared_error): + def score(self, X, y, embeddings=None, metric=mean_squared_error): """Calculate the score of the model using the specified metric. Parameters @@ -480,10 +518,10 @@ def score(self, X, y, metric=mean_squared_error): score : float The score calculated using the specified metric. """ - predictions = self.predict(X) + predictions = self.predict(X, embeddings) return metric(y, predictions) - def encode(self, X, batch_size=64): + def encode(self, X, embeddings=None, batch_size=64): """ Encodes input data using the trained model's embedding layer. @@ -507,14 +545,16 @@ def encode(self, X, batch_size=64): # Ensure model and data module are initialized if self.task_model is None or self.data_module is None: raise ValueError("The model or data module has not been fitted yet.") - encoded_dataset = self.data_module.preprocess_new_data(X) + encoded_dataset = self.data_module.preprocess_new_data(X, embeddings) data_loader = DataLoader(encoded_dataset, batch_size=batch_size, shuffle=False) # Process data in batches encoded_outputs = [] - for num_features, cat_features in tqdm(data_loader): - embeddings = self.task_model.base_model.encode(num_features, cat_features) # Call your encode function + for batch in tqdm(data_loader): + embeddings = self.task_model.base_model.encode( + batch + ) # Call your encode function encoded_outputs.append(embeddings) # Concatenate all encoded outputs @@ -528,6 +568,8 @@ def optimize_hparams( y, X_val=None, y_val=None, + embeddings=None, + embeddings_val=None, time=100, max_epochs=200, prune_by_epoch=True, @@ -578,15 +620,25 @@ def optimize_hparams( ) # Initial model fitting to get the baseline validation loss - self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs) + self.fit( + X, + y, + X_val=X_val, + y_val=y_val, + embeddings=embeddings, + embeddings_val=embeddings_val, + max_epochs=max_epochs, + ) best_val_loss = float("inf") if X_val is not None and y_val is not None: - val_loss = self.evaluate(X_val, y_val, metrics={"Mean Squared Error": mean_squared_error})[ - "Mean Squared Error" - ] + val_loss = self.evaluate( + X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} + )["Mean Squared Error"] else: - val_loss = self.trainer.validate(self.task_model, self.data_module)[0]["val_loss"] + val_loss = self.trainer.validate(self.task_model, self.data_module)[0][ + "val_loss" + ] best_val_loss = val_loss best_epoch_val_loss = self.task_model.epoch_val_loss_at( # type: ignore @@ 
-612,7 +664,9 @@ def _objective(hyperparams): if param_value in activation_mapper: setattr(self.config, key, activation_mapper[param_value]) else: - raise ValueError(f"Unknown activation function: {param_value}") + raise ValueError( + f"Unknown activation function: {param_value}" + ) else: setattr(self.config, key, param_value) @@ -621,11 +675,22 @@ def _objective(hyperparams): self.config.head_layer_sizes = head_layer_sizes[:head_layer_size_length] # Build the model with updated hyperparameters - self.build_model(X, y, X_val=X_val, y_val=y_val, lr=self.config.lr, **optimize_kwargs) + self.build_model( + X, + y, + X_val=X_val, + y_val=y_val, + embeddings=embeddings, + embeddings_val=embeddings_val, + lr=self.config.lr, + **optimize_kwargs, + ) # Dynamically set the early pruning threshold if prune_by_epoch: - early_pruning_threshold = best_epoch_val_loss * 1.5 # Prune based on specific epoch loss + early_pruning_threshold = ( + best_epoch_val_loss * 1.5 + ) # Prune based on specific epoch loss else: # Prune based on the best overall validation loss early_pruning_threshold = best_val_loss * 1.5 @@ -636,15 +701,19 @@ def _objective(hyperparams): try: # Wrap the risky operation (model fitting) in a try-except block - self.fit(X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False) + self.fit( + X, y, X_val=X_val, y_val=y_val, max_epochs=max_epochs, rebuild=False + ) # Evaluate validation loss if X_val is not None and y_val is not None: - val_loss = self.evaluate(X_val, y_val, metrics={"Mean Squared Error": mean_squared_error})[ - "Mean Squared Error" - ] + val_loss = self.evaluate( + X_val, y_val, metrics={"Mean Squared Error": mean_squared_error} + )["Mean Squared Error"] else: - val_loss = self.trainer.validate(self.task_model, self.data_module)[0]["val_loss"] + val_loss = self.trainer.validate(self.task_model, self.data_module)[ + 0 + ]["val_loss"] # Pruning based on validation loss at specific epoch epoch_val_loss = self.task_model.epoch_val_loss_at( # type: ignore @@ -661,15 +730,21 @@ def _objective(hyperparams): except Exception as e: # Penalize the hyperparameter configuration with a large value - print(f"Error encountered during fit with hyperparameters {hyperparams}: {e}") - return best_val_loss * 100 # Large value to discourage this configuration + print( + f"Error encountered during fit with hyperparameters {hyperparams}: {e}" + ) + return ( + best_val_loss * 100 + ) # Large value to discourage this configuration # Perform Bayesian optimization using scikit-optimize result = gp_minimize(_objective, param_space, n_calls=time, random_state=42) # Update the model with the best-found hyperparameters best_hparams = result.x # type: ignore - head_layer_sizes = [] if "head_layer_sizes" in self.config.__dataclass_fields__ else None + head_layer_sizes = ( + [] if "head_layer_sizes" in self.config.__dataclass_fields__ else None + ) layer_sizes = [] if "layer_sizes" in self.config.__dataclass_fields__ else None # Iterate over the best hyperparameters found by optimization From 6c0bc5c52aba0b8de598ca1cab9cc6ca3ebca1d0 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:43:47 +0100 Subject: [PATCH 09/24] preprocessor does not preprocess embeddings, but takes them as input to include them in feature information --- mambular/preprocessing/preprocessor.py | 178 +++++++++++++++++++------ 1 file changed, 136 insertions(+), 42 deletions(-) diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py index a691649..0fa7340 100644 --- 
a/mambular/preprocessing/preprocessor.py +++ b/mambular/preprocessing/preprocessor.py @@ -111,10 +111,14 @@ def __init__( ): self.n_bins = n_bins self.numerical_preprocessing = ( - numerical_preprocessing.lower() if numerical_preprocessing is not None else "none" + numerical_preprocessing.lower() + if numerical_preprocessing is not None + else "none" ) self.categorical_preprocessing = ( - categorical_preprocessing.lower() if categorical_preprocessing is not None else "none" + categorical_preprocessing.lower() + if categorical_preprocessing is not None + else "none" ) if self.numerical_preprocessing not in [ "ple", @@ -237,20 +241,40 @@ def _detect_column_types(self, X): numerical_features.append(col) else: if isinstance(self.cat_cutoff, float): - cutoff_condition = (num_unique_values / total_samples) < self.cat_cutoff + cutoff_condition = ( + num_unique_values / total_samples + ) < self.cat_cutoff elif isinstance(self.cat_cutoff, int): cutoff_condition = num_unique_values < self.cat_cutoff else: - raise ValueError("cat_cutoff should be either a float or an integer.") + raise ValueError( + "cat_cutoff should be either a float or an integer." + ) - if X[col].dtype.kind not in "iufc" or (X[col].dtype.kind == "i" and cutoff_condition): + if X[col].dtype.kind not in "iufc" or ( + X[col].dtype.kind == "i" and cutoff_condition + ): categorical_features.append(col) else: numerical_features.append(col) return numerical_features, categorical_features - def fit(self, X, y=None): + def _fit_embeddings(self, embeddings): + if embeddings is not None: + self.embeddings = True + self.embedding_dimensions = {} + if isinstance(embeddings, np.ndarray): + self.embedding_dimensions["embeddings_1"] = embeddings.shape[1] + elif isinstance(embeddings, list) and all( + isinstance(e, np.ndarray) for e in embeddings + ): + for idx, e in enumerate(embeddings): + self.embedding_dimensions[f"embedding_{idx+1}"] = e.shape[1] + else: + self.embeddings = False + + def fit(self, X, y=None, embeddings=None): """Fits the preprocessor to the data by identifying feature types and configuring the appropriate transformations for each feature. It sets up a column transformer with a pipeline of transformations for numerical and categorical features based on the specified preprocessing strategy. 
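`_fit_embeddings` above performs no transformation of the embedding arrays; it only flags their presence and records each source's width so that `get_feature_info` can report it later. A standalone sketch of that bookkeeping (an independent re-implementation, not the library class); note that the single-array branch in the committed code stores its key as "embeddings_1" while `transform` later looks up "embedding_1", so passing a list of arrays is the consistent path and the sketch keys everything as `embedding_<i>`:

import numpy as np

def record_embedding_dimensions(embeddings):
    # Mirrors Preprocessor._fit_embeddings: map each embedding source to its
    # per-sample dimensionality under keys "embedding_1", "embedding_2", ...
    if embeddings is None:
        return None
    if isinstance(embeddings, np.ndarray):
        embeddings = [embeddings]
    return {f"embedding_{i + 1}": e.shape[1] for i, e in enumerate(embeddings)}

# e.g. one 384-dim text source and one 512-dim image source per row
dims = record_embedding_dimensions([np.zeros((100, 384)), np.zeros((100, 512))])
assert dims == {"embedding_1": 384, "embedding_2": 512}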
@@ -269,6 +293,8 @@ def fit(self, X, y=None): if isinstance(X, dict): X = pd.DataFrame(X) + self._fit_embeddings(embeddings) + numerical_features, categorical_features = self._detect_column_types(X) transformers = [] @@ -308,7 +334,11 @@ def fit(self, X, y=None): ( "discretizer", KBinsDiscretizer( - n_bins=(bins if isinstance(bins, int) else len(bins) - 1), + n_bins=( + bins + if isinstance(bins, int) + else len(bins) - 1 + ), encode="ordinal", strategy=self.binning_strategy, # type: ignore subsample=200_000 if len(X) > 200_000 else None, @@ -337,13 +367,17 @@ def fit(self, X, y=None): numeric_transformer_steps.append(("scaler", StandardScaler())) elif self.numerical_preprocessing == "minmax": - numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1)))) + numeric_transformer_steps.append( + ("minmax", MinMaxScaler(feature_range=(-1, 1))) + ) elif self.numerical_preprocessing == "quantile": numeric_transformer_steps.append( ( "quantile", - QuantileTransformer(n_quantiles=self.n_bins, random_state=101), + QuantileTransformer( + n_quantiles=self.n_bins, random_state=101 + ), ) ) @@ -351,7 +385,9 @@ def fit(self, X, y=None): if self.scaling_strategy == "standardization": numeric_transformer_steps.append(("scaler", StandardScaler())) elif self.scaling_strategy == "minmax": - numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1)))) + numeric_transformer_steps.append( + ("minmax", MinMaxScaler(feature_range=(-1, 1))) + ) numeric_transformer_steps.append( ( "polynomial", @@ -366,7 +402,9 @@ def fit(self, X, y=None): if self.scaling_strategy == "standardization": numeric_transformer_steps.append(("scaler", StandardScaler())) elif self.scaling_strategy == "minmax": - numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1)))) + numeric_transformer_steps.append( + ("minmax", MinMaxScaler(feature_range=(-1, 1))) + ) numeric_transformer_steps.append( ( "splines", @@ -385,7 +423,9 @@ def fit(self, X, y=None): if self.scaling_strategy == "standardization": numeric_transformer_steps.append(("scaler", StandardScaler())) elif self.scaling_strategy == "minmax": - numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1)))) + numeric_transformer_steps.append( + ("minmax", MinMaxScaler(feature_range=(-1, 1))) + ) numeric_transformer_steps.append( ( "rbf", @@ -402,7 +442,9 @@ def fit(self, X, y=None): if self.scaling_strategy == "standardization": numeric_transformer_steps.append(("scaler", StandardScaler())) elif self.scaling_strategy == "minmax": - numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1)))) + numeric_transformer_steps.append( + ("minmax", MinMaxScaler(feature_range=(-1, 1))) + ) numeric_transformer_steps.append( ( "sigmoid", @@ -416,8 +458,12 @@ def fit(self, X, y=None): ) elif self.numerical_preprocessing == "ple": - numeric_transformer_steps.append(("minmax", MinMaxScaler(feature_range=(-1, 1)))) - numeric_transformer_steps.append(("ple", PLE(n_bins=self.n_bins, task=self.task))) + numeric_transformer_steps.append( + ("minmax", MinMaxScaler(feature_range=(-1, 1))) + ) + numeric_transformer_steps.append( + ("ple", PLE(n_bins=self.n_bins, task=self.task)) + ) elif self.numerical_preprocessing == "box-cox": numeric_transformer_steps.append( @@ -483,12 +529,18 @@ def fit(self, X, y=None): ] ) else: - raise ValueError(f"Unknown categorical_preprocessing type: {self.categorical_preprocessing}") + raise ValueError( + f"Unknown categorical_preprocessing type: 
{self.categorical_preprocessing}" + ) # Append the transformer for the current categorical feature - transformers.append((f"cat_{feature}", categorical_transformer, [feature])) + transformers.append( + (f"cat_{feature}", categorical_transformer, [feature]) + ) - self.column_transformer = ColumnTransformer(transformers=transformers, remainder="passthrough") + self.column_transformer = ColumnTransformer( + transformers=transformers, remainder="passthrough" + ) self.column_transformer.fit(X, y) self.fitted = True @@ -514,16 +566,20 @@ def _get_decision_tree_bins(self, X, y, numerical_features): bins = [] for feature in numerical_features: tree_model = ( - DecisionTreeClassifier(max_depth=3) if y.dtype.kind in "bi" else DecisionTreeRegressor(max_depth=3) + DecisionTreeClassifier(max_depth=3) + if y.dtype.kind in "bi" + else DecisionTreeRegressor(max_depth=3) ) tree_model.fit(X[[feature]], y) thresholds = tree_model.tree_.threshold[tree_model.tree_.feature != -2] # type: ignore bin_edges = np.sort(np.unique(thresholds)) - bins.append(np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()]))) + bins.append( + np.concatenate(([X[feature].min()], bin_edges, [X[feature].max()])) + ) return bins - def transform(self, X): + def transform(self, X, embeddings=None): """Transforms the input data using the preconfigured column transformer and converts the output into a dictionary format with keys corresponding to transformed feature names and values as arrays of transformed data. @@ -538,8 +594,7 @@ def transform(self, X): Parameters ---------- X (DataFrame): The input data to be transformed. - X (DataFrame): The input data to be transformed. - + embeddings (np.array or list of np.arrays, optional): The embedding data to include in the transformation. Returns ------- @@ -554,6 +609,33 @@ def transform(self, X): # Now let's convert this into a dictionary of arrays, one per column transformed_dict = self._split_transformed_output(X, transformed_X) + if embeddings is not None: + assert self.embeddings is True, "self.embeddings should be True but is not." + + if isinstance(embeddings, np.ndarray): + assert ( + self.embedding_dimensions["embedding_1"] == embeddings.shape[1] + ), ( + f"Expected embedding dimension {self.embedding_dimensions['embeddings']}, " + f"but got {embeddings.shape[1]}" + ) + transformed_dict["embedding_1"] = embeddings.astype(np.float32) + elif isinstance(embeddings, list) and all( + isinstance(e, np.ndarray) for e in embeddings + ): + for idx, e in enumerate(embeddings): + key = f"embedding_{idx+1}" + assert self.embedding_dimensions[key] == e.shape[1], ( + f"Expected embedding dimension {self.embedding_dimensions[key]} for {key}, " + f"but got {e.shape[1]}" + ) + transformed_dict[key] = e.astype(np.float32) + else: + assert ( + self.embeddings is False + ), "self.embeddings should be False when embeddings are None." + self.embeddings = False + return transformed_dict def _split_transformed_output(self, X, transformed_X): @@ -592,7 +674,7 @@ def _split_transformed_output(self, X, transformed_X): start = end return transformed_dict - def fit_transform(self, X, y=None): + def fit_transform(self, X, y=None, embeddings=None): """Fits the preprocessor to the data and then transforms the data using the fitted preprocessing pipelines. This is a convenience method that combines `fit` and `transform`. 
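
For illustration, a rough sketch of the combined call follows; the Preprocessor import path, data, and constructor defaults are assumptions, while the embedding keyword, the embedding_{i} keys, and the float32 cast come from the transform() change above.

    import numpy as np
    import pandas as pd
    from mambular.preprocessing import Preprocessor  # assumed import path

    X = pd.DataFrame({"x1": np.random.rand(50), "x2": np.random.rand(50)})
    y = np.random.rand(50)
    embs = [np.random.rand(50, 16), np.random.rand(50, 8)]

    pp = Preprocessor()  # assuming default constructor arguments
    out = pp.fit_transform(X, y, embeddings=embs)
    print(sorted(k for k in out if k.startswith("embedding_")))  # ['embedding_1', 'embedding_2']
    print(out["embedding_1"].dtype)  # float32, cast inside transform()
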
@@ -607,9 +689,9 @@ def fit_transform(self, X, y=None): dict: A dictionary with the transformed data, where keys are the base feature names and values are the transformed features as arrays. """ - self.fit(X, y) + self.fit(X, y, embeddings) self.fitted = True - return self.transform(X) + return self.transform(X, embeddings) def get_feature_info(self, verbose=True): """Retrieves information about how features are encoded within the model's preprocessor. This method identifies @@ -619,24 +701,34 @@ def get_feature_info(self, verbose=True): This method should only be called after the preprocessor has been fitted, as it relies on the structure and configuration of the `column_transformer` attribute. - Raises ------ RuntimeError: If the `column_transformer` is not yet fitted, indicating that the preprocessor must be fitted before invoking this method. - Returns ------- - tuple of (dict, dict): + tuple of (dict, dict, dict): - The first dictionary maps feature names to their respective number of bins or categories if they are processed using discretization or ordinal encoding. - The second dictionary includes feature names with other encoding details, such as the dimension of features after encoding transformations (e.g., one-hot encoding dimensions). + - The third dictionary includes feature information for embeddings if available. """ numerical_feature_info = {} categorical_feature_info = {} + if self.embeddings: + embedding_feature_info = {} + for key, dim in self.embedding_dimensions.items(): + embedding_feature_info[key] = { + "preprocessing": None, + "dimension": dim, + "categories": None, + } + else: + embedding_feature_info = None + if not self.column_transformer: raise RuntimeError("The preprocessor has not been fitted yet.") @@ -648,12 +740,10 @@ def get_feature_info(self, verbose=True): steps = [step[0] for step in transformer_pipeline.steps] for feature_name in columns: - # Initialize common fields preprocessing_type = " -> ".join(steps) dimension = None categories = None - # Numerical features if "discretizer" in steps or any( step in steps for step in [ @@ -666,23 +756,23 @@ def get_feature_info(self, verbose=True): ): last_step = transformer_pipeline.steps[-1][1] if hasattr(last_step, "transform"): - # Single-column input for dimension check dummy_input = np.zeros((1, 1)) transformed_feature = last_step.transform(dummy_input) dimension = transformed_feature.shape[1] numerical_feature_info[feature_name] = { "preprocessing": preprocessing_type, "dimension": dimension, - "categories": None, # Numerical features don't have categories + "categories": None, } if verbose: - print(f"Numerical Feature: {feature_name}, Info: {numerical_feature_info[feature_name]}") + print( + f"Numerical Feature: {feature_name}, Info: {numerical_feature_info[feature_name]}" + ) - # Categorical features elif "continuous_ordinal" in steps: step = transformer_pipeline.named_steps["continuous_ordinal"] categories = len(step.mapping_[columns.index(feature_name)]) - dimension = 1 # Ordinal encoding always outputs one dimension + dimension = 1 categorical_feature_info[feature_name] = { "preprocessing": preprocessing_type, "dimension": dimension, @@ -697,7 +787,7 @@ def get_feature_info(self, verbose=True): step = transformer_pipeline.named_steps["onehot"] if hasattr(step, "categories_"): categories = sum(len(cat) for cat in step.categories_) - dimension = categories # One-hot encoding expands into multiple dimensions + dimension = categories categorical_feature_info[feature_name] = { "preprocessing": 
preprocessing_type, "dimension": dimension, @@ -708,7 +798,6 @@ def get_feature_info(self, verbose=True): f"Categorical Feature (One-Hot): {feature_name}, Info: {categorical_feature_info[feature_name]}" ) - # Fallback for other transformations else: last_step = transformer_pipeline.steps[-1][1] if hasattr(last_step, "transform"): @@ -719,20 +808,25 @@ def get_feature_info(self, verbose=True): categorical_feature_info[feature_name] = { "preprocessing": preprocessing_type, "dimension": dimension, - "categories": None, # Categories not defined for unknown categorical transformations + "categories": None, } else: numerical_feature_info[feature_name] = { "preprocessing": preprocessing_type, "dimension": dimension, - "categories": None, # Numerical features don't have categories + "categories": None, } if verbose: print( - f"Categorical Feature: {feature_name}, Info: {preprocessing_type}, Dimension: {dimension}" + f"Feature: {feature_name}, Info: {preprocessing_type}, Dimension: {dimension}" ) if verbose: print("-" * 50) - return numerical_feature_info, categorical_feature_info + if verbose and self.embeddings: + print("Embeddings:") + for key, value in embedding_feature_info.items(): + print(f" Feature: {key}, Dimension: {value['dimension']}") + + return numerical_feature_info, categorical_feature_info, embedding_feature_info From 743c214c4a5f47b224535edb92b2d90bca6ff3ef Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Thu, 23 Jan 2025 19:44:06 +0100 Subject: [PATCH 10/24] feature dimensions adapted to new output format of get_feature_info --- mambular/utils/get_feature_dimensions.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mambular/utils/get_feature_dimensions.py b/mambular/utils/get_feature_dimensions.py index 7ad000d..b72980b 100644 --- a/mambular/utils/get_feature_dimensions.py +++ b/mambular/utils/get_feature_dimensions.py @@ -1,8 +1,10 @@ -def get_feature_dimensions(num_feature_info, cat_feature_info): +def get_feature_dimensions(num_feature_info, cat_feature_info, embedding_info): input_dim = 0 - for feature_name, feature_info in num_feature_info.items(): + for _, feature_info in num_feature_info.items(): input_dim += feature_info["dimension"] - for feature_name, feature_info in cat_feature_info.items(): + for _, feature_info in cat_feature_info.items(): + input_dim += feature_info["dimension"] + for _, feature_info in embedding_info.items(): input_dim += feature_info["dimension"] return input_dim From 4ec70f81549f9d5e3e0a6db06be53d91ccab07b5 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Fri, 24 Jan 2025 14:45:48 +0100 Subject: [PATCH 11/24] adapting all basemodels to new dataset __getitem__ method --- mambular/base_models/ft_transformer.py | 23 +++++------- mambular/base_models/mambatab.py | 31 ++++++++++------- mambular/base_models/mambattn.py | 28 +++++++-------- mambular/base_models/mambular.py | 21 +++++------ mambular/base_models/mlp.py | 9 +++-- mambular/base_models/ndtf.py | 34 +++++++++--------- mambular/base_models/node.py | 26 +++++++------- mambular/base_models/resnet.py | 39 ++++++++++++--------- mambular/base_models/saint.py | 27 ++++++--------- mambular/base_models/tabm.py | 48 ++++++++++++++++---------- mambular/base_models/tabtransformer.py | 26 ++++++-------- mambular/base_models/tabularnn.py | 5 ++- 12 files changed, 158 insertions(+), 159 deletions(-) diff --git a/mambular/base_models/ft_transformer.py b/mambular/base_models/ft_transformer.py index 56e546d..f0c7fb8 100644 --- a/mambular/base_models/ft_transformer.py +++ 
b/mambular/base_models/ft_transformer.py @@ -6,6 +6,7 @@ from ..arch_utils.transformer_utils import CustomTransformerEncoderLayer from ..configs.fttransformer_config import DefaultFTTransformerConfig from .basemodel import BaseModel +import numpy as np class FTTransformer(BaseModel): @@ -52,22 +53,18 @@ class FTTransformer(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes=1, config: DefaultFTTransformerConfig = DefaultFTTransformerConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) self.returns_ensemble = False - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info # embedding layer self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) @@ -87,25 +84,23 @@ def __init__( ) # pooling - n_inputs = len(num_feature_info) + len(cat_feature_info) + n_inputs = np.sum([len(info) for info in feature_information]) self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - def forward(self, num_features, cat_features): + def forward(self, *data): """Defines the forward pass of the model. Parameters ---------- - num_features : Tensor - Tensor containing the numerical features. - cat_features : Tensor - Tensor containing the categorical features. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. Returns ------- Tensor The output predictions of the model. """ - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) x = self.encoder(x) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index fa1e231..b1b111a 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -5,6 +5,7 @@ from ..arch_utils.mamba_utils.mamba_arch import Mamba from ..arch_utils.mamba_utils.mamba_original import MambaOriginal from ..arch_utils.mlp_utils import MLPhead +from ..utils.get_feature_dimensions import get_feature_dimensions from ..configs.mambatab_config import DefaultMambaTabConfig from .basemodel import BaseModel @@ -56,23 +57,16 @@ class MambaTab(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes=1, config: DefaultMambaTabConfig = DefaultMambaTabConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) - input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - input_dim += 1 - for feature_name, input_shape in cat_feature_info.items(): - input_dim += 1 + input_dim = get_feature_dimensions(*feature_information) - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info self.returns_ensemble = False self.initial_layer = nn.Linear(input_dim, config.d_model) @@ -93,9 +87,20 @@ def __init__( else: self.mamba = MambaOriginal(config) - def forward(self, num_features, cat_features): - x = num_features + cat_features - x = torch.cat(x, dim=1) + def forward(self, *data): + """Forward pass of the Mambatab model + + Parameters + ---------- + data : 
tuple + Input tuple of tensors of num_features, cat_features, embeddings. + + Returns + ------- + torch.Tensor + Output tensor. + """ + x = torch.cat([t for tensors in data for t in tensors], dim=1) x = self.initial_layer(x) if self.axis == 1: diff --git a/mambular/base_models/mambattn.py b/mambular/base_models/mambattn.py index f393154..fd86eee 100644 --- a/mambular/base_models/mambattn.py +++ b/mambular/base_models/mambattn.py @@ -1,5 +1,5 @@ import torch - +import numpy as np from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.mamba_utils.mambattn_arch import MambAttn @@ -52,14 +52,15 @@ class MambAttention(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes=1, config: DefaultMambAttentionConfig = DefaultMambAttentionConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) + + self.returns_ensemble = False try: self.pooling_method = self.hparams.pooling_method @@ -76,8 +77,7 @@ def __init__( # embedding layer self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) @@ -101,25 +101,23 @@ def __init__( self.perm = torch.randperm(self.embedding_layer.seq_len) # pooling - n_inputs = len(num_feature_info) + len(cat_feature_info) + n_inputs = np.sum([len(info) for info in feature_information]) self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - def forward(self, num_features, cat_features): + def forward(self, *data): """Defines the forward pass of the model. Parameters ---------- - num_features : Tensor - Tensor containing the numerical features. - cat_features : Tensor - Tensor containing the categorical features. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. Returns ------- - Tensor - The output predictions of the model. + torch.Tensor + Output tensor. 
""" - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) if self.shuffle_embeddings: x = x[:, self.perm, :] diff --git a/mambular/base_models/mambular.py b/mambular/base_models/mambular.py index ee73b3d..f24df96 100644 --- a/mambular/base_models/mambular.py +++ b/mambular/base_models/mambular.py @@ -6,6 +6,7 @@ from ..arch_utils.mlp_utils import MLPhead from ..configs.mambular_config import DefaultMambularConfig from .basemodel import BaseModel +import numpy as np class Mambular(BaseModel): @@ -52,21 +53,19 @@ class Mambular(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (cat_feature_info, num_feature_info, embedding_feature_info) num_classes=1, config: DefaultMambularConfig = DefaultMambularConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) self.returns_ensemble = False # embedding layer self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) @@ -85,25 +84,23 @@ def __init__( self.perm = torch.randperm(self.embedding_layer.seq_len) # pooling - n_inputs = len(num_feature_info) + len(cat_feature_info) + n_inputs = np.sum([len(info) for info in feature_information]) self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - def forward(self, num_features, cat_features): + def forward(self, *data): """Defines the forward pass of the model. Parameters ---------- - num_features : Tensor - Tensor containing the numerical features. - cat_features : Tensor - Tensor containing the categorical features. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. Returns ------- Tensor The output predictions of the model. 
""" - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) if self.hparams.shuffle_embeddings: x = x[:, self.perm, :] diff --git a/mambular/base_models/mlp.py b/mambular/base_models/mlp.py index 1a38871..94194d8 100644 --- a/mambular/base_models/mlp.py +++ b/mambular/base_models/mlp.py @@ -1,11 +1,10 @@ import torch import torch.nn as nn - +import numpy as np from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..configs.mlp_config import DefaultMLPConfig from ..utils.get_feature_dimensions import get_feature_dimensions from .basemodel import BaseModel -import numpy as np class MLP(BaseModel): @@ -58,7 +57,7 @@ class MLP(BaseModel): def __init__( self, - feature_information: tuple, # Expecting (cat_feature_info, num_feature_info, embedding_feature_info) + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes: int = 1, config: DefaultMLPConfig = DefaultMLPConfig(), # noqa: B008 **kwargs, @@ -71,8 +70,6 @@ def __init__( # Initialize layers self.layers = nn.ModuleList() - input_dim = get_feature_dimensions(*feature_information) - if self.hparams.use_embeddings: self.embedding_layer = EmbeddingLayer( *feature_information, @@ -81,6 +78,8 @@ def __init__( input_dim = np.sum( [len(info) * self.hparams.d_model for info in feature_information] ) + else: + input_dim = get_feature_dimensions(*feature_information) # Input layer self.layers.append(nn.Linear(input_dim, self.hparams.layer_sizes[0])) diff --git a/mambular/base_models/ndtf.py b/mambular/base_models/ndtf.py index e279dc0..c750993 100644 --- a/mambular/base_models/ndtf.py +++ b/mambular/base_models/ndtf.py @@ -54,20 +54,17 @@ class NDTF(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes: int = 1, config: DefaultNDTFConfig = DefaultNDTFConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info self.returns_ensemble = False - input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) + input_dim = get_feature_dimensions(*feature_information) self.input_dimensions = [input_dim] @@ -78,10 +75,13 @@ def __init__( [ NeuralDecisionTree( input_dim=self.input_dimensions[idx], - depth=np.random.randint(self.hparams.min_depth, self.hparams.max_depth), + depth=np.random.randint( + self.hparams.min_depth, self.hparams.max_depth + ), output_dim=num_classes, lamda=self.hparams.lamda, - temperature=self.hparams.temperature + np.abs(np.random.normal(0, 0.1)), + temperature=self.hparams.temperature + + np.abs(np.random.normal(0, 0.1)), node_sampling=self.hparams.node_sampling, ) for idx in range(self.hparams.n_ensembles) @@ -103,21 +103,20 @@ def __init__( requires_grad=True, ) - def forward(self, num_features, cat_features) -> torch.Tensor: + def forward(self, *data) -> torch.Tensor: """Forward pass of the NDTF model. Parameters ---------- - x : torch.Tensor - Input tensor. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. Returns ------- torch.Tensor Output tensor. 
""" - x = num_features + cat_features - x = torch.cat(x, dim=1) + x = torch.cat([t for tensors in data for t in tensors], dim=1) x = self.conv_layer(x.unsqueeze(2)) x = x.transpose(1, 2).squeeze(-1) @@ -131,21 +130,20 @@ def forward(self, num_features, cat_features) -> torch.Tensor: return preds @ self.tree_weights - def penalty_forward(self, num_features, cat_features) -> torch.Tensor: + def penalty_forward(self, *data) -> torch.Tensor: """Forward pass of the NDTF model. Parameters ---------- - x : torch.Tensor - Input tensor. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. Returns ------- torch.Tensor Output tensor. """ - x = num_features + cat_features - x = torch.cat(x, dim=1) + x = torch.cat([t for tensors in data for t in tensors], dim=1) x = self.conv_layer(x.unsqueeze(2)) x = x.transpose(1, 2).squeeze(-1) diff --git a/mambular/base_models/node.py b/mambular/base_models/node.py index 82cbf91..7010460 100644 --- a/mambular/base_models/node.py +++ b/mambular/base_models/node.py @@ -6,6 +6,7 @@ from ..configs.node_config import DefaultNODEConfig from ..utils.get_feature_dimensions import get_feature_dimensions from .basemodel import BaseModel +import numpy as np class NODE(BaseModel): @@ -52,8 +53,7 @@ class NODE(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes: int = 1, config: DefaultNODEConfig = DefaultNODEConfig(), # noqa: B008 **kwargs, @@ -63,16 +63,17 @@ def __init__( self.returns_ensemble = False - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info - if self.hparams.use_embeddings: - input_dim = len(num_feature_info) * self.hparams.d_model + len(cat_feature_info) * self.hparams.d_model - - self.embedding_layer = EmbeddingLayer(config) # type: ignore + self.embedding_layer = EmbeddingLayer( + *feature_information, + config=config, + ) + input_dim = np.sum( + [len(info) * self.hparams.d_model for info in feature_information] + ) else: - input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) + input_dim = get_feature_dimensions(*feature_information) self.d_out = num_classes self.block = DenseBlock( @@ -90,7 +91,7 @@ def __init__( output_dim=num_classes, ) - def forward(self, num_features, cat_features): + def forward(self, *data): """Forward pass through the NODE model. Parameters @@ -106,12 +107,11 @@ def forward(self, num_features, cat_features): Model output of shape [batch_size, num_classes]. 
""" if self.hparams.use_embeddings: - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) B, S, D = x.shape x = x.reshape(B, S * D) else: - x = num_features + cat_features - x = torch.cat(x, dim=1) + x = torch.cat([t for tensors in data for t in tensors], dim=1) x = self.block(x).squeeze(-1) x = self.tabular_head(x) diff --git a/mambular/base_models/resnet.py b/mambular/base_models/resnet.py index a2e487e..2a383bc 100644 --- a/mambular/base_models/resnet.py +++ b/mambular/base_models/resnet.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn - +import numpy as np from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.resnet_utils import ResidualBlock from ..configs.resnet_config import DefaultResNetConfig @@ -56,30 +56,26 @@ class ResNet(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes: int = 1, config: DefaultResNetConfig = DefaultResNetConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) self.returns_ensemble = False - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info if self.hparams.use_embeddings: - input_dim = len(num_feature_info) * self.hparams.d_model + len(cat_feature_info) * self.hparams.d_model - # embedding layer self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) - + input_dim = np.sum( + [len(info) * self.hparams.d_model for info in feature_information] + ) else: - input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) + input_dim = get_feature_dimensions(*feature_information) self.initial_layer = nn.Linear(input_dim, self.hparams.layer_sizes[0]) @@ -102,14 +98,25 @@ def __init__( self.output_layer = nn.Linear(self.hparams.layer_sizes[-1], num_classes) - def forward(self, num_features, cat_features): + def forward(self, *data): + """Forward pass of the ResNet model. + + Parameters + ---------- + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. + + Returns + ------- + torch.Tensor + Output tensor. 
+ """ if self.hparams.use_embeddings: - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) B, S, D = x.shape x = x.reshape(B, S * D) else: - x = num_features + cat_features - x = torch.cat(x, dim=1) + x = torch.cat([t for tensors in data for t in tensors], dim=1) x = self.initial_layer(x) for block in self.blocks: diff --git a/mambular/base_models/saint.py b/mambular/base_models/saint.py index 38847fa..e2c6738 100644 --- a/mambular/base_models/saint.py +++ b/mambular/base_models/saint.py @@ -50,25 +50,22 @@ class SAINT(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes=1, config: DefaultSAINTConfig = DefaultSAINTConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) self.returns_ensemble = False - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info - n_inputs = len(num_feature_info) + len(cat_feature_info) + + n_inputs = np.sum([len(info) for info in feature_information]) if getattr(config, "use_cls", True): n_inputs += 1 # embedding layer self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) @@ -89,22 +86,20 @@ def __init__( self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - def forward(self, num_features, cat_features): + def forward(self, *data): """Defines the forward pass of the model. Parameters ---------- - num_features : Tensor - Tensor containing the numerical features. - cat_features : Tensor - Tensor containing the categorical features. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. Returns ------- - Tensor - The output predictions of the model. + torch.Tensor + Output tensor. 
""" - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) x = self.encoder(x) diff --git a/mambular/base_models/tabm.py b/mambular/base_models/tabm.py index 7683b4b..ef6e605 100644 --- a/mambular/base_models/tabm.py +++ b/mambular/base_models/tabm.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn - +import numpy as np from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.batch_ensemble_layer import LinearBatchEnsembleLayer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer @@ -11,10 +11,10 @@ class TabM(BaseModel): + def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes: int = 1, config: DefaultTabMConfig = DefaultTabMConfig(), # noqa: B008 **kwargs, @@ -23,7 +23,7 @@ def __init__( super().__init__(config=config, **kwargs) # Save hparams including config attributes - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) if not self.hparams.average_ensembles: self.returns_ensemble = True # Directly set ensemble flag else: @@ -35,18 +35,19 @@ def __init__( # Conditionally initialize EmbeddingLayer based on self.hparams if self.hparams.use_embeddings: self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *feature_information, config=config, ) if self.hparams.average_embeddings: input_dim = self.hparams.d_model else: - input_dim = (len(num_feature_info) + len(cat_feature_info)) * config.d_model + input_dim = np.sum( + [len(info) * self.hparams.d_model for info in feature_information] + ) else: - input_dim = get_feature_dimensions(num_feature_info, cat_feature_info) + input_dim = get_feature_dimensions(*feature_information) # Input layer with batch ensembling self.layers.append( @@ -71,7 +72,11 @@ def __init__( if self.hparams.use_glu: self.layers.append(nn.GLU()) else: - self.layers.append(self.hparams.activation if hasattr(self.hparams, "activation") else nn.SELU()) + self.layers.append( + self.hparams.activation + if hasattr(self.hparams, "activation") + else nn.SELU() + ) if self.hparams.dropout > 0.0: self.layers.append(nn.Dropout(self.hparams.dropout)) @@ -105,7 +110,11 @@ def __init__( if self.hparams.use_glu: self.layers.append(nn.GLU()) else: - self.layers.append(self.hparams.activation if hasattr(self.hparams, "activation") else nn.SELU()) + self.layers.append( + self.hparams.activation + if hasattr(self.hparams, "activation") + else nn.SELU() + ) if self.hparams.dropout > 0.0: self.layers.append(nn.Dropout(self.hparams.dropout)) @@ -118,15 +127,13 @@ def __init__( num_classes, ) - def forward(self, num_features, cat_features) -> torch.Tensor: + def forward(self, *data) -> torch.Tensor: """Forward pass of the TabM model with batch ensembling. Parameters ---------- - num_features : torch.Tensor - Numerical features tensor. - cat_features : torch.Tensor - Categorical features tensor. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. 
Returns ------- @@ -135,7 +142,7 @@ def forward(self, num_features, cat_features) -> torch.Tensor: """ # Handle embeddings if used if self.hparams.use_embeddings: - x = self.embedding_layer(num_features, cat_features) + x = self.embedding_layer(*data) # Option 1: Average over feature dimension (N) if self.hparams.average_embeddings: x = x.mean(dim=1) # Shape: (B, D) @@ -145,15 +152,18 @@ def forward(self, num_features, cat_features) -> torch.Tensor: x = x.reshape(B, N * D) # Shape: (B, N * D) else: - x = num_features + cat_features - x = torch.cat(x, dim=1) + x = torch.cat([t for tensors in data for t in tensors], dim=1) # Process through layers with optional skip connections for i in range(len(self.layers) - 1): if isinstance(self.layers[i], LinearBatchEnsembleLayer): out = self.layers[i](x) # `out` shape is expected to be (batch_size, ensemble_size, out_features) - if hasattr(self, "skip_connections") and self.skip_connections and x.shape == out.shape: + if ( + hasattr(self, "skip_connections") + and self.skip_connections + and x.shape == out.shape + ): x = x + out else: x = out diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index df8104a..aee1f7a 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn - +import numpy as np from ..arch_utils.get_norm_fn import get_normalization_layer from ..arch_utils.layer_utils.embedding_layer import EmbeddingLayer from ..arch_utils.mlp_utils import MLPhead @@ -61,14 +61,14 @@ class TabTransformer(BaseModel): def __init__( self, - cat_feature_info, - num_feature_info, + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes=1, config: DefaultTabTransformerConfig = DefaultTabTransformerConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) + num_feature_info, cat_feature_info, emb_feature_info = feature_information if cat_feature_info == {}: raise ValueError( "You are trying to fit a TabTransformer with no categorical features. \ ) self.returns_ensemble = False - self.cat_feature_info = cat_feature_info - self.num_feature_info = num_feature_info # embedding layer self.embedding_layer = EmbeddingLayer( - num_feature_info=num_feature_info, - cat_feature_info=cat_feature_info, + *({}, cat_feature_info, emb_feature_info), config=config, ) @@ -107,25 +104,24 @@ def __init__( ) # pooling - n_inputs = len(num_feature_info) + len(cat_feature_info) + n_inputs = np.sum([len(info) for info in feature_information]) self.initialize_pooling_layers(config=config, n_inputs=n_inputs) - def forward(self, num_features, cat_features): + def forward(self, *data): """Defines the forward pass of the model. Parameters ---------- - num_features : Tensor - Tensor containing the numerical features. - cat_features : Tensor - Tensor containing the categorical features. + data : tuple + Input tuple of tensors of num_features, cat_features, embeddings. Returns ------- Tensor The output predictions of the model.
""" - cat_embeddings = self.embedding_layer(None, cat_features) + num_features, cat_features, emb_features = data + cat_embeddings = self.embedding_layer(*(None, cat_features, emb_features)) num_features = torch.cat(num_features, dim=1) num_embeddings = self.norm_f(num_features) # type: ignore diff --git a/mambular/base_models/tabularnn.py b/mambular/base_models/tabularnn.py index 5699bf7..6ac5c3a 100644 --- a/mambular/base_models/tabularnn.py +++ b/mambular/base_models/tabularnn.py @@ -1,5 +1,4 @@ from dataclasses import replace - import torch import torch.nn as nn @@ -15,13 +14,13 @@ class TabulaRNN(BaseModel): def __init__( self, - feature_information: tuple, # Expecting (cat_feature_info, num_feature_info, embedding_feature_info) + feature_information: tuple, # Expecting (num_feature_info, cat_feature_info, embedding_feature_info) num_classes=1, config: DefaultTabulaRNNConfig = DefaultTabulaRNNConfig(), # noqa: B008 **kwargs, ): super().__init__(config=config, **kwargs) - self.save_hyperparameters(ignore=["cat_feature_info", "num_feature_info"]) + self.save_hyperparameters(ignore=["feature_information"]) self.returns_ensemble = False From a2c7845ee2fb82b42d729493520d249443b7eea5 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Fri, 24 Jan 2025 15:44:15 +0100 Subject: [PATCH 12/24] adapt lightning layer and preprocessor to account for no passed embeddings --- .../arch_utils/layer_utils/embedding_layer.py | 48 ++++++++++--------- mambular/preprocessing/preprocessor.py | 2 +- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/mambular/arch_utils/layer_utils/embedding_layer.py b/mambular/arch_utils/layer_utils/embedding_layer.py index 0184ca3..6098adb 100644 --- a/mambular/arch_utils/layer_utils/embedding_layer.py +++ b/mambular/arch_utils/layer_utils/embedding_layer.py @@ -101,20 +101,21 @@ def __init__(self, num_feature_info, cat_feature_info, emb_feature_info, config) ] ) - if self.embedding_projection: - self.emb_embeddings = nn.ModuleList( - [ - nn.Sequential( - nn.Linear( - feature_info["dimension"], - self.d_model, - bias=self.embedding_bias, - ), - self.embedding_activation, - ) - for feature_name, feature_info in emb_feature_info.items() - ] - ) + if len(emb_feature_info) >= 1: + if self.embedding_projection: + self.emb_embeddings = nn.ModuleList( + [ + nn.Sequential( + nn.Linear( + feature_info["dimension"], + self.d_model, + bias=self.embedding_bias, + ), + self.embedding_activation, + ) + for feature_name, feature_info in emb_feature_info.items() + ] + ) # Class token if required if self.use_cls: @@ -181,15 +182,16 @@ def forward(self, num_features, cat_features, emb_features): if self.layer_norm_after_embedding: num_embeddings = self.embedding_norm(num_embeddings) - if self.embedding_projection: - emb_embeddings = [ - emb(emb_features[i]) for i, emb in enumerate(self.emb_embeddings) - ] - emb_embeddings = torch.stack(emb_embeddings, dim=1) - else: - emb_embeddings = torch.stack(emb_features, dim=1) - if self.layer_norm_after_embedding: - emb_embeddings = self.embedding_norm(emb_embeddings) + if emb_features != []: + if self.embedding_projection: + emb_embeddings = [ + emb(emb_features[i]) for i, emb in enumerate(self.emb_embeddings) + ] + emb_embeddings = torch.stack(emb_embeddings, dim=1) + else: + emb_embeddings = torch.stack(emb_features, dim=1) + if self.layer_norm_after_embedding: + emb_embeddings = self.embedding_norm(emb_embeddings) embeddings = [ e for e in [cat_embeddings, num_embeddings, emb_embeddings] if e is not None diff --git 
a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py index 0fa7340..99b4123 100644 --- a/mambular/preprocessing/preprocessor.py +++ b/mambular/preprocessing/preprocessor.py @@ -727,7 +727,7 @@ def get_feature_info(self, verbose=True): "categories": None, } else: - embedding_feature_info = None + embedding_feature_info = {} if not self.column_transformer: raise RuntimeError("The preprocessor has not been fitted yet.") From b8bc5e977bdcf27ba8f717f62cfbafe3532f7d08 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:11:46 +0100 Subject: [PATCH 13/24] restructure configs to create parent config-class --- mambular/configs/base_config.py | 83 +++++++++++++++++++++++ mambular/configs/fttransformer_config.py | 42 +----------- mambular/configs/mambatab_config.py | 38 +---------- mambular/configs/mambattention_config.py | 44 +----------- mambular/configs/mambular_config.py | 45 ++---------- mambular/configs/mlp_config.py | 57 +--------------- mambular/configs/ndtf_config.py | 15 +--- mambular/configs/node_config.py | 48 +------------ mambular/configs/resnet_config.py | 50 +------------- mambular/configs/saint_config.py | 51 ++------------ mambular/configs/tabm_config.py | 60 +--------------- mambular/configs/tabtransformer_config.py | 42 ++---------- mambular/configs/tabularnn_config.py | 46 ++----------- 13 files changed, 120 insertions(+), 501 deletions(-) create mode 100644 mambular/configs/base_config.py diff --git a/mambular/configs/base_config.py b/mambular/configs/base_config.py new file mode 100644 index 0000000..0e5a639 --- /dev/null +++ b/mambular/configs/base_config.py @@ -0,0 +1,83 @@ +from dataclasses import dataclass, field +from collections.abc import Callable +import torch.nn as nn + + +@dataclass +class BaseConfig: + """ + Base configuration class with shared hyperparameters for models. + + This configuration class provides common hyperparameters for optimization, + embeddings, and categorical encoding, which can be inherited by specific + model configurations. + + Parameters + ---------- + lr : float, default=1e-04 + Learning rate for the optimizer. + lr_patience : int, default=10 + Number of epochs with no improvement before reducing the learning rate. + weight_decay : float, default=1e-06 + L2 regularization parameter for weight decay in the optimizer. + lr_factor : float, default=0.1 + Factor by which the learning rate is reduced when patience is exceeded. + activation : Callable, default=nn.ReLU() + Activation function to use in the model's layers. + cat_encoding : str, default="int" + Method for encoding categorical features ('int', 'one-hot', or 'linear'). + + Embedding Parameters + -------------------- + use_embeddings : bool, default=False + Whether to use embeddings for categorical or numerical features. + embedding_activation : Callable, default=nn.Identity() + Activation function applied to embeddings. + embedding_type : str, default="linear" + Type of embedding to use ('linear', 'plr', etc.). + embedding_bias : bool, default=False + Whether to use bias in embedding layers. + layer_norm_after_embedding : bool, default=False + Whether to apply layer normalization after embedding layers. + d_model : int, default=32 + Dimensionality of embeddings or model representations. + plr_lite : bool, default=False + Whether to use a lightweight version of Piecewise Linear Regression (PLR). + n_frequencies : int, default=48 + Number of frequency components for embeddings. 
+ frequencies_init_scale : float, default=0.01 + Initial scale for frequency components in embeddings. + embedding_projection : bool, default=True + Whether to apply a projection layer after embeddings. + + Notes + ----- + - This base class is meant to be inherited by other configurations. + - Provides default values that can be overridden in derived configurations. + + """ + + # Training Parameters + lr: float = 1e-04 + lr_patience: int = 10 + weight_decay: float = 1e-06 + lr_factor: float = 0.1 + + # Embedding Parameters + use_embeddings: bool = False + embedding_activation: Callable = nn.Identity() # noqa: RUF009 + embedding_type: str = "linear" + embedding_bias: bool = False + layer_norm_after_embedding: bool = False + d_model: int = 32 + plr_lite: bool = False + n_frequencies: int = 48 + frequencies_init_scale: float = 0.01 + embedding_projection: bool = True + + # Architecture Parameters + batch_norm: bool = False + layer_norm: bool = False + layer_norm_eps: float = 1e-05 + activation: Callable = nn.ReLU() # noqa: RUF009 + cat_encoding: str = "int" diff --git a/mambular/configs/fttransformer_config.py b/mambular/configs/fttransformer_config.py index d6aa11d..37bdcf4 100644 --- a/mambular/configs/fttransformer_config.py +++ b/mambular/configs/fttransformer_config.py @@ -1,25 +1,16 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn - from ..arch_utils.transformer_utils import ReGLU +from .base_config import BaseConfig @dataclass -class DefaultFTTransformerConfig: +class DefaultFTTransformerConfig(BaseConfig): """Configuration class for the FT Transformer model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. d_model : int, default=128 Dimensionality of the transformer model. n_layers : int, default=4 @@ -44,20 +35,6 @@ class DefaultFTTransformerConfig: Whether to apply normalization before other operations in each transformer block. bias : bool, default=True Whether to use bias in linear layers. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. - embedding_bias : bool, default=False - Whether to use bias in embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. head_layer_sizes : list, default=() Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.5 @@ -76,12 +53,6 @@ class DefaultFTTransformerConfig: Method for encoding categorical features ('int', 'one-hot', or 'linear'). 
""" - # Optimizer Parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters d_model: int = 128 n_layers: int = 4 @@ -96,15 +67,6 @@ class DefaultFTTransformerConfig: norm_first: bool = False bias: bool = True - # Embedding Parameters - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - embedding_type: str = "linear" - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - embedding_bias: bool = False - layer_norm_after_embedding: bool = False - # Head Parameters head_layer_sizes: list = field(default_factory=list) head_dropout: float = 0.5 diff --git a/mambular/configs/mambatab_config.py b/mambular/configs/mambatab_config.py index c00d4ba..ccfe459 100644 --- a/mambular/configs/mambatab_config.py +++ b/mambular/configs/mambatab_config.py @@ -1,23 +1,15 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultMambaTabConfig: +class DefaultMambaTabConfig(BaseConfig): """Configuration class for the Default MambaTab model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. d_model : int, default=64 Dimensionality of the model. n_layers : int, default=1 @@ -50,18 +42,6 @@ class DefaultMambaTabConfig: Activation function for the model. axis : int, default=1 Axis along which operations are applied, if applicable. - num_embedding_activation : callable, default=nn.ReLU() - Activation function for numerical embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. head_layer_sizes : list, default=() Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.0 @@ -82,12 +62,6 @@ class DefaultMambaTabConfig: Whether to process data bidirectionally. 
""" - # Optimizer Parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters d_model: int = 64 n_layers: int = 1 @@ -106,14 +80,6 @@ class DefaultMambaTabConfig: activation: Callable = nn.ReLU() # noqa: RUF009 axis: int = 1 - # Embedding Parameters - num_embedding_activation: Callable = nn.ReLU() # noqa: RUF009 - embedding_type: str = "linear" - embedding_bias: bool = False - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - # Head Parameters head_layer_sizes: list = field(default_factory=list) head_dropout: float = 0.0 diff --git a/mambular/configs/mambattention_config.py b/mambular/configs/mambattention_config.py index b1f029a..49e596e 100644 --- a/mambular/configs/mambattention_config.py +++ b/mambular/configs/mambattention_config.py @@ -1,23 +1,15 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultMambAttentionConfig: +class DefaultMambAttentionConfig(BaseConfig): """Configuration class for the Default Mambular Attention model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. d_model : int, default=64 Dimensionality of the model. n_layers : int, default=4 @@ -58,22 +50,6 @@ class DefaultMambAttentionConfig: Type of normalization used in the model. activation : callable, default=nn.SiLU() Activation function for the model. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - num_embedding_activation : callable, default=nn.ReLU() - Activation function for numerical embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. head_layer_sizes : list, default=() Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.5 @@ -106,12 +82,6 @@ class DefaultMambAttentionConfig: Number of attention layers in the model. 
""" - # Optimizer Parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters d_model: int = 64 n_layers: int = 4 @@ -133,16 +103,6 @@ class DefaultMambAttentionConfig: dt_init_floor: float = 1e-04 norm: str = "LayerNorm" activation: Callable = nn.SiLU() # noqa: RUF009 - layer_norm_eps: float = 1e-05 - - # Embedding Parameters - num_embedding_activation: Callable = nn.ReLU() # noqa: RUF009 - embedding_type: str = "linear" - embedding_bias: bool = False - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - layer_norm_after_embedding: bool = False # Head Parameters head_layer_sizes: list = field(default_factory=list) diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index fcebca0..8bc2f90 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -1,23 +1,15 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultMambularConfig: +class DefaultMambularConfig(BaseConfig): """Configuration class for the Default Mambular model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. d_model : int, default=64 Dimensionality of the model. n_layers : int, default=4 @@ -28,6 +20,8 @@ class DefaultMambularConfig: Whether to use bias in the linear layers. dropout : float, default=0.0 Dropout rate for regularization. + d_conv : int, default=4 + Size of convolution over columns. dt_rank : str, default="auto" Rank of the decision tree used in the model. d_state : int, default=128 @@ -46,22 +40,6 @@ class DefaultMambularConfig: Type of normalization used ('RMSNorm', etc.). activation : callable, default=nn.SiLU() Activation function for the model. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. shuffle_embeddings : bool, default=False Whether to shuffle embeddings before being passed to Mamba layers. head_layer_sizes : list, default=() @@ -88,15 +66,10 @@ class DefaultMambularConfig: Version of the Mamba model to use ('mamba-torch', 'mamba1', 'mamba2'). 
""" - # Optimizer Parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters d_model: int = 64 n_layers: int = 4 + d_conv: int = 4 expand_factor: int = 2 bias: bool = False dropout: float = 0.0 @@ -109,16 +82,8 @@ class DefaultMambularConfig: dt_init_floor: float = 1e-04 norm: str = "RMSNorm" activation: Callable = nn.SiLU() # noqa: RUF009 - layer_norm_eps: float = 1e-05 # Embedding Parameters - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - embedding_type: str = "linear" - embedding_bias: bool = False - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - layer_norm_after_embedding: bool = False shuffle_embeddings: bool = False # Head Parameters diff --git a/mambular/configs/mlp_config.py b/mambular/configs/mlp_config.py index 08711be..1dda45f 100644 --- a/mambular/configs/mlp_config.py +++ b/mambular/configs/mlp_config.py @@ -1,23 +1,15 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultMLPConfig: +class DefaultMLPConfig(BaseConfig): """Configuration class for the default Multi-Layer Perceptron (MLP) model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. layer_sizes : list, default=(256, 128, 32) Sizes of the layers in the MLP. activation : callable, default=nn.ReLU() @@ -30,38 +22,8 @@ class DefaultMLPConfig: Whether to use Gated Linear Units (GLU) in the MLP. skip_connections : bool, default=False Whether to use skip connections in the MLP. - batch_norm : bool, default=False - Whether to use batch normalization in the MLP layers. - layer_norm : bool, default=False - Whether to use layer normalization in the MLP layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - use_embeddings : bool, default=False - Whether to use embedding layers for all features. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. - d_model : int, default=32 - Dimensionality of the embeddings. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. 
""" - # Optimizer Parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters layer_sizes: list = field(default_factory=lambda: [256, 128, 32]) activation: Callable = nn.ReLU() # noqa: RUF009 @@ -69,18 +31,3 @@ class DefaultMLPConfig: dropout: float = 0.2 use_glu: bool = False skip_connections: bool = False - batch_norm: bool = False - layer_norm: bool = False - layer_norm_eps: float = 1e-05 - - # Embedding Parameters - use_embeddings: bool = False - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - embedding_type: str = "linear" - embedding_bias: bool = False - layer_norm_after_embedding: bool = False - d_model: int = 32 - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - embedding_projection: bool = True diff --git a/mambular/configs/ndtf_config.py b/mambular/configs/ndtf_config.py index 89fad29..1fa1eec 100644 --- a/mambular/configs/ndtf_config.py +++ b/mambular/configs/ndtf_config.py @@ -1,20 +1,13 @@ from dataclasses import dataclass +from .base_config import BaseConfig @dataclass -class DefaultNDTFConfig: +class DefaultNDTFConfig(BaseConfig): """Configuration class for the default Neural Decision Tree Forest (NDTF) model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) applied to the model's weights during optimization. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced when a plateau is reached. min_depth : int, default=2 Minimum depth of trees in the forest. Controls the simplest model structure. max_depth : int, default=10 @@ -33,10 +26,6 @@ class DefaultNDTFConfig: Factor with which the penalty is multiplied """ - lr: float = 1e-4 - lr_patience: int = 5 - weight_decay: float = 1e-7 - lr_factor: float = 0.1 min_depth: int = 4 max_depth: int = 16 temperature: float = 0.1 diff --git a/mambular/configs/node_config.py b/mambular/configs/node_config.py index 82a4bda..2c93d30 100644 --- a/mambular/configs/node_config.py +++ b/mambular/configs/node_config.py @@ -1,23 +1,15 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultNODEConfig: +class DefaultNODEConfig(BaseConfig): """Configuration class for the Neural Oblivious Decision Ensemble (NODE) model. Parameters ---------- - lr : float, default=1e-03 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs without improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization penalty) applied by the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate is reduced when there is no improvement. num_layers : int, default=4 Number of dense layers in the model. layer_dim : int, default=128 @@ -28,24 +20,6 @@ class DefaultNODEConfig: Depth of each decision tree in the ensemble. norm : str, default=None Type of normalization to use in the model. - use_embeddings : bool, default=False - Whether to use embedding layers for categorical features. - embedding_activation : callable, default=nn.Identity() - Activation function to apply to embeddings. 
- embedding_type : str, default="linear" - Type of embedding to use ('linear', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. - d_model : int, default=32 - Dimensionality of the embedding space. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. head_layer_sizes : list, default=() Sizes of the layers in the model's head. head_dropout : float, default=0.5 @@ -58,31 +32,13 @@ class DefaultNODEConfig: Whether to use batch normalization in the head layers. """ - # Optimizer Parameters - lr: float = 1e-03 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters num_layers: int = 4 layer_dim: int = 128 tree_dim: int = 1 depth: int = 6 - norm: str | None = None - # Embedding Parameters - use_embeddings: bool = False - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - embedding_type: str = "linear" - embedding_bias: bool = False - layer_norm_after_embedding: bool = False - d_model: int = 32 - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - # Head Parameters head_layer_sizes: list = field(default_factory=list) head_dropout: float = 0.5 diff --git a/mambular/configs/resnet_config.py b/mambular/configs/resnet_config.py index e904957..7a458d5 100644 --- a/mambular/configs/resnet_config.py +++ b/mambular/configs/resnet_config.py @@ -1,23 +1,15 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultResNetConfig: +class DefaultResNetConfig(BaseConfig): """Configuration class for the default ResNet model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization penalty) applied by the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate is reduced when there is no improvement. layer_sizes : list, default=(256, 128, 32) Sizes of the layers in the ResNet. activation : callable, default=nn.SELU() @@ -32,36 +24,13 @@ class DefaultResNetConfig: Whether to use Gated Linear Units (GLU) in the ResNet. skip_connections : bool, default=True Whether to use skip connections in the ResNet. - batch_norm : bool, default=True - Whether to use batch normalization in the ResNet layers. - layer_norm : bool, default=False - Whether to use layer normalization in the ResNet layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. num_blocks : int, default=3 Number of residual blocks in the ResNet. - use_embeddings : bool, default=True - Whether to use embedding layers for all features. - embedding_type : str, default="linear" - Type of embedding to use ('linear', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). 
average_embeddings : bool, default=True Whether to average embeddings during the forward pass. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. - d_model : int, default=64 - Dimensionality of the embeddings. """ - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 + # model params layer_sizes: list = field(default_factory=lambda: [256, 128, 32]) activation: Callable = nn.SELU() # noqa: RUF009 skip_layers: bool = False @@ -69,20 +38,7 @@ class DefaultResNetConfig: norm: bool = False use_glu: bool = False skip_connections: bool = True - batch_norm: bool = True - layer_norm: bool = False - layer_norm_eps: float = 1e-05 num_blocks: int = 3 # embedding params - use_embeddings: bool = True - embedding_type: str = "linear" - embedding_bias = False - plr_lite: bool = False average_embeddings: bool = True - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - layer_norm_after_embedding: bool = False - d_model: int = 64 - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 diff --git a/mambular/configs/saint_config.py b/mambular/configs/saint_config.py index 6c166cb..3e90369 100644 --- a/mambular/configs/saint_config.py +++ b/mambular/configs/saint_config.py @@ -1,29 +1,21 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultSAINTConfig: +class DefaultSAINTConfig(BaseConfig): """Configuration class for the SAINT model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 regularization) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - d_model : int, default=128 - Dimensionality of the transformer model. n_layers : int, default=4 Number of transformer layers. n_heads : int, default=8 Number of attention heads in the transformer. + d_model : int, default=128 + Dimensionality of embeddings or model representations. attn_dropout : float, default=0.2 Dropout rate for the attention mechanism. ff_dropout : float, default=0.1 @@ -36,26 +28,10 @@ class DefaultSAINTConfig: Activation function for the transformer feed-forward layers. transformer_dim_feedforward : int, default=256 Dimensionality of the feed-forward layers in the transformer. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization to improve numerical stability. norm_first : bool, default=False Whether to apply normalization before other operations in each transformer block. bias : bool, default=True Whether to use bias in linear layers. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', 'plr', etc.). - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. 
- embedding_bias : bool, default=False - Whether to use bias in embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. head_layer_sizes : list, default=() Sizes of the fully connected layers in the model's head. head_dropout : float, default=0.5 @@ -74,32 +50,17 @@ class DefaultSAINTConfig: Method for encoding categorical features ('int', 'one-hot', or 'linear'). """ - # Optimizer Parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters - d_model: int = 32 + n_layers: int = 1 n_heads: int = 2 attn_dropout: float = 0.2 ff_dropout: float = 0.1 norm: str = "LayerNorm" activation: Callable = nn.GELU() # noqa: RUF009 - layer_norm_eps: float = 1e-05 norm_first: bool = False bias: bool = True - - # Embedding Parameters - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - embedding_type: str = "linear" - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - embedding_bias: bool = False - layer_norm_after_embedding: bool = False + d_model: int = 128 # Head Parameters head_layer_sizes: list = field(default_factory=list) diff --git a/mambular/configs/tabm_config.py b/mambular/configs/tabm_config.py index ee52dc8..4c4a931 100644 --- a/mambular/configs/tabm_config.py +++ b/mambular/configs/tabm_config.py @@ -1,24 +1,16 @@ from collections.abc import Callable from dataclasses import dataclass, field from typing import Literal - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultTabMConfig: +class DefaultTabMConfig(BaseConfig): """Configuration class for the TabM model with batch ensembling and predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which the learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate is reduced when there is no improvement. layer_sizes : list, default=(512, 512, 128) Sizes of the layers in the model. activation : callable, default=nn.ReLU() @@ -29,32 +21,6 @@ class DefaultTabMConfig: Normalization method to be used, if any. use_glu : bool, default=False Whether to use Gated Linear Units (GLU) in the model. - batch_norm : bool, default=False - Whether to use batch normalization in the model layers. - layer_norm : bool, default=False - Whether to use layer normalization in the model layers. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. - use_embeddings : bool, default=True - Whether to use embedding layers for all features. - embedding_type : str, default="plr" - Type of embedding to use ('plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. - average_embeddings : bool, default=False - Whether to average embeddings during the forward pass. - embedding_activation : callable, default=nn.ReLU() - Activation function for embeddings. 
- layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. - d_model : int, default=64 - Dimensionality of the embeddings. ensemble_size : int, default=32 Number of ensemble members for batch ensembling. ensemble_scaling_in : bool, default=True @@ -71,34 +37,12 @@ class DefaultTabMConfig: Model type to use ('mini' for reduced version, 'full' for complete model). """ - # lr params - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-05 - lr_factor: float = 0.1 - # arch params layer_sizes: list = field(default_factory=lambda: [256, 256, 128]) activation: Callable = nn.ReLU() # noqa: RUF009 dropout: float = 0.5 norm: str | None = None use_glu: bool = False - batch_norm: bool = False - layer_norm: bool = False - layer_norm_eps: float = 1e-05 - - # embedding params - use_embeddings: bool = True - embedding_type: str = "linear" - embedding_bias = False - plr_lite: bool = False - average_embeddings: bool = False - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - layer_norm_after_embedding: bool = False - d_model: int = 32 - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 # Batch ensembling specific configurations ensemble_size: int = 32 diff --git a/mambular/configs/tabtransformer_config.py b/mambular/configs/tabtransformer_config.py index 3cdea5c..84f16c9 100644 --- a/mambular/configs/tabtransformer_config.py +++ b/mambular/configs/tabtransformer_config.py @@ -1,31 +1,22 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn - from ..arch_utils.transformer_utils import ReGLU +from .base_config import BaseConfig @dataclass -class DefaultTabTransformerConfig: +class DefaultTabTransformerConfig(BaseConfig): """Configuration class for the default Tab Transformer model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. - d_model : int, default=128 - Dimensionality of the model. n_layers : int, default=4 Number of layers in the transformer. n_heads : int, default=8 Number of attention heads in the transformer. + d_model : int, default=128 + Dimensionality of embeddings or model representations. attn_dropout : float, default=0.2 Dropout rate for the attention mechanism. ff_dropout : float, default=0.1 @@ -38,20 +29,10 @@ class DefaultTabTransformerConfig: Activation function for the transformer layers. transformer_dim_feedforward : int, default=512 Dimensionality of the feed-forward layers in the transformer. - layer_norm_eps : float, default=1e-05 - Epsilon value for layer normalization. norm_first : bool, default=True Whether to apply normalization before other operations in each transformer block. bias : bool, default=True Whether to use bias in the linear layers. - embedding_activation : callable, default=nn.Identity() - Activation function for embeddings. - embedding_type : str, default="linear" - Type of embedding to use ('linear', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding. 
head_layer_sizes : list, default=() Sizes of the layers in the model's head. head_dropout : float, default=0.5 @@ -68,14 +49,7 @@ class DefaultTabTransformerConfig: Encoding method for categorical features ('int', 'one-hot', etc.). """ - # Optimizer Parameters - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture Parameters - d_model: int = 128 n_layers: int = 4 n_heads: int = 8 attn_dropout: float = 0.2 @@ -84,15 +58,9 @@ class DefaultTabTransformerConfig: activation: Callable = nn.SELU() # noqa: RUF009 transformer_activation: Callable = ReGLU() # noqa: RUF009 transformer_dim_feedforward: int = 512 - layer_norm_eps: float = 1e-05 norm_first: bool = True bias: bool = True - - # Embedding Parameters - embedding_activation: Callable = nn.Identity() # noqa: RUF009 - embedding_type: str = "linear" - embedding_bias: bool = False - layer_norm_after_embedding: bool = False + d_model: int = 128 # Head Parameters head_layer_sizes: list = field(default_factory=list) diff --git a/mambular/configs/tabularnn_config.py b/mambular/configs/tabularnn_config.py index f945fbe..84a9a99 100644 --- a/mambular/configs/tabularnn_config.py +++ b/mambular/configs/tabularnn_config.py @@ -1,51 +1,29 @@ from collections.abc import Callable from dataclasses import dataclass, field - import torch.nn as nn +from .base_config import BaseConfig @dataclass -class DefaultTabulaRNNConfig: +class DefaultTabulaRNNConfig(BaseConfig): """Configuration class for the TabulaRNN model with predefined hyperparameters. Parameters ---------- - lr : float, default=1e-04 - Learning rate for the optimizer. - lr_patience : int, default=10 - Number of epochs with no improvement after which learning rate will be reduced. - weight_decay : float, default=1e-06 - Weight decay (L2 penalty) for the optimizer. - lr_factor : float, default=0.1 - Factor by which the learning rate will be reduced. model_type : str, default="RNN" Type of model, one of "RNN", "LSTM", "GRU", "mLSTM", "sLSTM". - d_model : int, default=128 - Dimensionality of the model. n_layers : int, default=4 Number of layers in the RNN. rnn_dropout : float, default=0.2 Dropout rate for the RNN layers. + d_model : int, default=128 + Dimensionality of embeddings or model representations. norm : str, default="RMSNorm" Normalization method to be used. activation : callable, default=nn.SELU() Activation function for the RNN layers. residuals : bool, default=False Whether to include residual connections in the RNN. - embedding_type : str, default="linear" - Type of embedding for features ('linear', 'plr', etc.). - embedding_bias : bool, default=False - Whether to use bias in the embedding layers. - plr_lite : bool, default=False - Whether to use a lightweight version of Piecewise Linear Regression (PLR). - n_frequencies : int, default=48 - Number of frequencies for PLR embeddings. - frequencies_init_scale : float, default=0.01 - Initial scale for frequency parameters in embeddings. - embedding_activation : callable, default=nn.ReLU() - Activation function for embeddings. - layer_norm_after_embedding : bool, default=False - Whether to apply layer normalization after embedding layers. head_layer_sizes : list, default=() Sizes of the layers in the head of the model. head_dropout : float, default=0.5 @@ -74,12 +52,6 @@ class DefaultTabulaRNNConfig: Whether to use bias in the convolutional layers. 
""" - # Optimizer params - lr: float = 1e-04 - lr_patience: int = 10 - weight_decay: float = 1e-06 - lr_factor: float = 0.1 - # Architecture params model_type: str = "RNN" d_model: int = 128 @@ -89,16 +61,6 @@ class DefaultTabulaRNNConfig: activation: Callable = nn.SELU() # noqa: RUF009 residuals: bool = False - # Embedding params - embedding_type: str = "linear" - embedding_bias: bool = False - plr_lite: bool = False - n_frequencies: int = 48 - frequencies_init_scale: float = 0.01 - embedding_activation: Callable = nn.ReLU() # noqa: RUF009 - layer_norm_after_embedding: bool = False - embedding_projection: bool = True - # Head params head_layer_sizes: list = field(default_factory=list) head_dropout: float = 0.5 From a4c5992a453598a3fbbd8fd16c1e81c84bf82ae7 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:12:05 +0100 Subject: [PATCH 14/24] fix minor bugs related to imports and dim identification --- mambular/base_models/mambatab.py | 2 +- mambular/base_models/saint.py | 1 + mambular/base_models/tabtransformer.py | 4 ++-- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/mambular/base_models/mambatab.py b/mambular/base_models/mambatab.py index b1b111a..4314bab 100644 --- a/mambular/base_models/mambatab.py +++ b/mambular/base_models/mambatab.py @@ -72,7 +72,7 @@ def __init__( self.initial_layer = nn.Linear(input_dim, config.d_model) self.norm_f = LayerNorm(config.d_model) - self.embedding_activation = self.hparams.num_embedding_activation + self.embedding_activation = self.hparams.embedding_activation self.axis = config.axis diff --git a/mambular/base_models/saint.py b/mambular/base_models/saint.py index e2c6738..e6cfe19 100644 --- a/mambular/base_models/saint.py +++ b/mambular/base_models/saint.py @@ -4,6 +4,7 @@ from ..arch_utils.transformer_utils import RowColTransformer from ..configs.saint_config import DefaultSAINTConfig from .basemodel import BaseModel +import numpy as np class SAINT(BaseModel): diff --git a/mambular/base_models/tabtransformer.py b/mambular/base_models/tabtransformer.py index aee1f7a..0287203 100644 --- a/mambular/base_models/tabtransformer.py +++ b/mambular/base_models/tabtransformer.py @@ -93,8 +93,8 @@ def __init__( ) mlp_input_dim = 0 - for feature_name, input_shape in num_feature_info.items(): - mlp_input_dim += input_shape + for feature_name, info in num_feature_info.items(): + mlp_input_dim += info["dimension"] mlp_input_dim += self.hparams.d_model self.tabular_head = MLPhead( From 6fc04ebe4d8235a515e998e3707eb6c5a21bd017 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:12:24 +0100 Subject: [PATCH 15/24] fix bug related to column names in datamodule - turn int to string --- mambular/data_utils/datamodule.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/mambular/data_utils/datamodule.py b/mambular/data_utils/datamodule.py index 459b8c1..40e80f7 100644 --- a/mambular/data_utils/datamodule.py +++ b/mambular/data_utils/datamodule.py @@ -212,8 +212,8 @@ def setup(self, stage: str): else torch.long ) - cat_key = ( - "cat_" + key + cat_key = "cat_" + str( + key ) # Assuming categorical keys are prefixed with 'cat_' if cat_key in train_preprocessed_data: train_cat_tensors.append( @@ -224,7 +224,7 @@ def setup(self, stage: str): torch.tensor(val_preprocessed_data[cat_key], dtype=dtype) ) - binned_key = "num_" + key # for binned features + binned_key = "num_" + str(key) # for binned features if binned_key in train_preprocessed_data: train_cat_tensors.append( 
torch.tensor(train_preprocessed_data[binned_key], dtype=dtype) @@ -237,8 +237,8 @@ def setup(self, stage: str): # Populate tensors for numerical features, if present in processed data for key in self.num_feature_info: # type: ignore - num_key = ( - "num_" + key + num_key = "num_" + str( + key ) # Assuming numerical keys are prefixed with 'num_' if num_key in train_preprocessed_data: train_num_tensors.append( @@ -306,13 +306,15 @@ def preprocess_new_data(self, X, embeddings): ) else torch.long ) - cat_key = "cat_" + key # Assuming categorical keys are prefixed with 'cat_' + cat_key = "cat_" + str( + key + ) # Assuming categorical keys are prefixed with 'cat_' if cat_key in preprocessed_data: cat_tensors.append( torch.tensor(preprocessed_data[cat_key], dtype=dtype) ) - binned_key = "num_" + key # for binned features + binned_key = "num_" + str(key) # for binned features if binned_key in preprocessed_data: cat_tensors.append( torch.tensor(preprocessed_data[binned_key], dtype=dtype) @@ -320,7 +322,9 @@ def preprocess_new_data(self, X, embeddings): # Populate tensors for numerical features, if present in processed data for key in self.num_feature_info: # type: ignore - num_key = "num_" + key # Assuming numerical keys are prefixed with 'num_' + num_key = "num_" + str( + key + ) # Assuming numerical keys are prefixed with 'num_' if num_key in preprocessed_data: num_tensors.append( torch.tensor(preprocessed_data[num_key], dtype=torch.float32) From e60dd80ef168febf8aa46a8d7f3ecb87c600991b Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:12:44 +0100 Subject: [PATCH 16/24] make box-cox strictly positive --- mambular/preprocessing/preprocessor.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mambular/preprocessing/preprocessor.py b/mambular/preprocessing/preprocessor.py index 99b4123..cbb9f2f 100644 --- a/mambular/preprocessing/preprocessor.py +++ b/mambular/preprocessing/preprocessor.py @@ -466,6 +466,9 @@ def fit(self, X, y=None, embeddings=None): ) elif self.numerical_preprocessing == "box-cox": + numeric_transformer_steps.append( + ("check_positive", MinMaxScaler(feature_range=(1e-3, 1))) + ) numeric_transformer_steps.append( ( "box-cox", @@ -752,11 +755,12 @@ def get_feature_info(self, verbose=True): "quantile", "polynomial", "splines", + "box-cox", ] ): last_step = transformer_pipeline.steps[-1][1] if hasattr(last_step, "transform"): - dummy_input = np.zeros((1, 1)) + dummy_input = np.zeros((1, 1)) + 1e-05 transformed_feature = last_step.transform(dummy_input) dimension = transformed_feature.shape[1] numerical_feature_info[feature_name] = { From febf1651851040dc426ace45fd52d5c4aa679ccc Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:12:54 +0100 Subject: [PATCH 17/24] include unit tests --- tests/test_base.py | 155 ++++++++++++++++++ tests/test_classifier.py | 115 ------------- tests/test_configs.py | 115 +++++++++++++ tests/test_distributions.py | 311 ------------------------------------ tests/test_lss.py | 104 ------------ tests/test_preprocessor.py | 185 ++++++++++++--------- tests/test_regressor.py | 103 ------------ 7 files changed, 377 insertions(+), 711 deletions(-) create mode 100644 tests/test_base.py delete mode 100644 tests/test_classifier.py create mode 100644 tests/test_configs.py delete mode 100644 tests/test_distributions.py delete mode 100644 tests/test_lss.py delete mode 100644 tests/test_regressor.py diff --git a/tests/test_base.py b/tests/test_base.py new file mode 100644 index 0000000..20d9797 --- /dev/null +++ 
b/tests/test_base.py @@ -0,0 +1,155 @@ +import pytest +import inspect +import torch +import os +import importlib +from mambular.base_models.basemodel import BaseModel + +# Paths for models and configs +MODEL_MODULE_PATH = "mambular.base_models" +CONFIG_MODULE_PATH = "mambular.configs" + +# Discover all models +model_classes = [] +for filename in os.listdir(os.path.dirname(__file__) + "/../mambular/base_models"): + if filename.endswith(".py") and filename not in [ + "__init__.py", + "basemodel.py", + "lightning_wrapper.py", + "bayesian_tabm.py", + ]: + module_name = f"{MODEL_MODULE_PATH}.{filename[:-3]}" + module = importlib.import_module(module_name) + + for name, obj in inspect.getmembers(module, inspect.isclass): + if issubclass(obj, BaseModel) and obj is not BaseModel: + model_classes.append(obj) + + +def get_model_config(model_class): + """Dynamically load the correct config class for each model.""" + model_name = model_class.__name__ # e.g., "Mambular" + config_class_name = f"Default{model_name}Config" # e.g., "DefaultMambularConfig" + + try: + config_module = importlib.import_module( + f"{CONFIG_MODULE_PATH}.{model_name.lower()}_config" + ) + config_class = getattr(config_module, config_class_name) + return config_class() # Instantiate config + except (ModuleNotFoundError, AttributeError) as e: + pytest.fail( + f"Could not find or instantiate config {config_class_name} for {model_name}: {e}" + ) + + +@pytest.mark.parametrize("model_class", model_classes) +def test_model_inherits_base_model(model_class): + """Test that each model correctly inherits from BaseModel.""" + assert issubclass( + model_class, BaseModel + ), f"{model_class.__name__} should inherit from BaseModel." + + +@pytest.mark.parametrize("model_class", model_classes) +def test_model_has_forward_method(model_class): + """Test that each model has a forward method with *data.""" + assert hasattr( + model_class, "forward" + ), f"{model_class.__name__} is missing a forward method." + + sig = inspect.signature(model_class.forward) + assert any( + p.kind == inspect.Parameter.VAR_POSITIONAL for p in sig.parameters.values() + ), f"{model_class.__name__}.forward should have *data argument." + + +@pytest.mark.parametrize("model_class", model_classes) +def test_model_takes_config(model_class): + """Test that each model accepts a config argument.""" + sig = inspect.signature(model_class.__init__) + assert ( + "config" in sig.parameters + ), f"{model_class.__name__} should accept a 'config' parameter." + + +@pytest.mark.parametrize("model_class", model_classes) +def test_model_has_num_classes(model_class): + """Test that each model accepts a num_classes argument.""" + sig = inspect.signature(model_class.__init__) + assert ( + "num_classes" in sig.parameters + ), f"{model_class.__name__} should accept a 'num_classes' parameter." + + +@pytest.mark.parametrize("model_class", model_classes) +def test_model_calls_super_init(model_class): + """Test that each model calls super().__init__(config=config, **kwargs).""" + source = inspect.getsource(model_class.__init__) + assert ( + "super().__init__(config=config" in source + ), f"{model_class.__name__} should call super().__init__(config=config, **kwargs)." 
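+
+# NOTE: the feature_information fixtures in the tests below mimic the
+# (num_feature_info, cat_feature_info, emb_feature_info) tuple returned by
+# Preprocessor.get_feature_info(); the dictionaries are illustrative mocks,
+# not real preprocessor output, and the models' forward(*data) unpacks the
+# same three-way split.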
+ + +@pytest.mark.parametrize("model_class", model_classes) +def test_model_initialization(model_class): + """Test that each model can be initialized with its correct config.""" + config = get_model_config(model_class) + feature_info = ( + { + "A": { + "preprocessing": "imputer -> check_positive -> box-cox", + "dimension": 1, + "categories": None, + } + }, + { + "sibsp": { + "preprocessing": "imputer -> continuous_ordinal", + "dimension": 1, + "categories": 8, + } + }, + {}, + ) # Mock feature info + + try: + model = model_class( + feature_information=feature_info, num_classes=3, config=config + ) + except Exception as e: + pytest.fail(f"Failed to initialize {model_class.__name__}: {e}") + + +@pytest.mark.parametrize("model_class", model_classes) +def test_model_defines_key_attributes(model_class): + """Test that each model defines expected attributes like returns_ensemble""" + config = get_model_config(model_class) + feature_info = ( + { + "A": { + "preprocessing": "imputer -> check_positive -> box-cox", + "dimension": 1, + "categories": None, + } + }, + { + "sibsp": { + "preprocessing": "imputer -> continuous_ordinal", + "dimension": 1, + "categories": 8, + } + }, + {}, + ) # Mock feature info + + try: + model = model_class( + feature_information=feature_info, num_classes=3, config=config + ) + except TypeError as e: + pytest.fail(f"Failed to initialize {model_class.__name__}: {e}") + + expected_attrs = ["returns_ensemble"] + for attr in expected_attrs: + assert hasattr(model, attr), f"{model_class.__name__} should define '{attr}'." diff --git a/tests/test_classifier.py b/tests/test_classifier.py deleted file mode 100644 index 7243233..0000000 --- a/tests/test_classifier.py +++ /dev/null @@ -1,115 +0,0 @@ -import unittest -from unittest.mock import MagicMock, patch - -import numpy as np -import pandas as pd -import torch -from sklearn.metrics import accuracy_score, log_loss - -from mambular.models import MambularClassifier # Ensure correct import path - - -class TestMambularClassifier(unittest.TestCase): - def setUp(self): - # Patching external dependencies - self.patcher_pl_trainer = patch("lightning.Trainer") - self.mock_pl_trainer = self.patcher_pl_trainer.start() - - self.patcher_base_model = patch("mambular.base_models.classifier.BaseMambularClassifier") - self.mock_base_model = self.patcher_base_model.start() - - self.classifier = MambularClassifier(d_model=128, dropout=0.1) - - # Sample data - self.X = pd.DataFrame(np.random.randn(100, 10)) - self.y = np.random.choice(["A", "B", "C"], size=100) - - self.classifier.cat_feature_info = {} - self.classifier.num_feature_info = {} - - def tearDown(self): - self.patcher_pl_trainer.stop() - self.patcher_base_model.stop() - - def test_initialization(self): - # This assumes MambularConfig is properly imported and used in the MambularRegressor class - from mambular.utils.configs import DefaultMambularConfig - - self.assertIsInstance(self.classifier.config, DefaultMambularConfig) - self.assertEqual(self.classifier.config.d_model, 128) - self.assertEqual(self.classifier.config.dropout, 0.1) - - def test_split_data(self): - """Test the data splitting functionality.""" - X_train, X_val, y_train, y_val = self.classifier.split_data(self.X, self.y, val_size=0.2, random_state=42) - self.assertEqual(len(X_train), 80) - self.assertEqual(len(X_val), 20) - self.assertEqual(len(y_train), 80) - self.assertEqual(len(y_val), 20) - - def test_fit(self): - """Test the training setup and call.""" - # Mock the necessary parts to simulate training - 
self.classifier.preprocess_data = MagicMock() - self.classifier.model = self.mock_base_model - - self.classifier.fit(self.X, self.y) - - # Ensure that the fit method of the trainer is called - self.mock_pl_trainer.return_value.fit.assert_called_once() - - def test_predict(self): - # Create a mock tensor as the model output - # Assuming three classes A, B, C as per self.y - mock_logits = torch.rand(100, 3) - - # Mock the model and its method calls - self.classifier.model = MagicMock() - self.classifier.model.eval.return_value = None - self.classifier.model.return_value = mock_logits - - # Mock preprocess_test_data to return dummy tensor data - self.classifier.preprocess_test_data = MagicMock(return_value=([], [])) - - predictions = self.classifier.predict(self.X) - - # Assert that predictions return as expected - expected_predictions = torch.argmax(mock_logits, dim=1).numpy() - np.testing.assert_array_equal(predictions, expected_predictions) - - def test_evaluate(self): - # Mock predict and predict_proba to simulate classifier output - mock_predictions = np.random.choice([0, 1, 2], size=100) - raw_probabilities = np.random.rand(100, 3) - # Normalize these probabilities so that each row sums to 1 - mock_probabilities = raw_probabilities / raw_probabilities.sum(axis=1, keepdims=True) - self.classifier.predict = MagicMock(return_value=mock_predictions) - self.classifier.predict_proba = MagicMock(return_value=mock_probabilities) - - # Define metrics to test - metrics = { - "Accuracy": (accuracy_score, False), - # Log Loss requires probability scores - "Log Loss": (log_loss, True), - } - - # Call evaluate with the defined metrics - result = self.classifier.evaluate(self.X, self.y, metrics=metrics) - - # Assert that predict and predict_proba were called correctly - self.classifier.predict.assert_called_once() - self.classifier.predict_proba.assert_called_once() - - # Check the results of evaluate - expected_accuracy = accuracy_score(self.y, mock_predictions) - expected_log_loss = log_loss(self.y, mock_probabilities) - self.assertEqual(result["Accuracy"], expected_accuracy) - self.assertAlmostEqual(result["Log Loss"], expected_log_loss) - - # Assert calls with appropriate arguments - self.classifier.predict.assert_called_once_with(self.X) - self.classifier.predict_proba.assert_called_once_with(self.X) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_configs.py b/tests/test_configs.py new file mode 100644 index 0000000..5299534 --- /dev/null +++ b/tests/test_configs.py @@ -0,0 +1,115 @@ +import pytest +import inspect +import importlib +import os +import dataclasses +import typing +from mambular.configs.base_config import BaseConfig # Ensure correct path + +CONFIG_MODULE_PATH = "mambular.configs" +config_classes = [] + +# Discover all config classes in mambular/configs/ +for filename in os.listdir(os.path.dirname(__file__) + "/../mambular/configs"): + if ( + filename.endswith(".py") + and filename != "base_config.py" + and not filename.startswith("__") + ): + module_name = f"{CONFIG_MODULE_PATH}.{filename[:-3]}" + module = importlib.import_module(module_name) + + for name, obj in inspect.getmembers(module, inspect.isclass): + if issubclass(obj, BaseConfig) and obj is not BaseConfig: + config_classes.append(obj) + + +@pytest.mark.parametrize("config_class", config_classes) +def test_config_inherits_baseconfig(config_class): + """Test that each config class correctly inherits from BaseConfig.""" + assert issubclass( + config_class, BaseConfig + ), f"{config_class.__name__} 
should inherit from BaseConfig." + + +@pytest.mark.parametrize("config_class", config_classes) +def test_config_instantiation(config_class): + """Test that each config class can be instantiated without errors.""" + try: + config = config_class() + except Exception as e: + pytest.fail(f"Failed to instantiate {config_class.__name__}: {e}") + + +@pytest.mark.parametrize("config_class", config_classes) +def test_config_has_expected_attributes(config_class): + """Test that each config has all required attributes from BaseConfig.""" + base_attrs = {field.name for field in dataclasses.fields(BaseConfig)} + config_attrs = {field.name for field in dataclasses.fields(config_class)} + + missing_attrs = base_attrs - config_attrs + assert ( + not missing_attrs + ), f"{config_class.__name__} is missing attributes: {missing_attrs}" + + +@pytest.mark.parametrize("config_class", config_classes) +def test_config_default_values(config_class): + """Ensure that each config class has default values assigned correctly.""" + config = config_class() + + for field in dataclasses.fields(config_class): + attr = field.name + expected_type = field.type + + assert hasattr( + config, attr + ), f"{config_class.__name__} is missing attribute '{attr}'." + + value = getattr(config, attr) + + # Handle generic types properly + origin = typing.get_origin(expected_type) + + if origin is typing.Literal: + # If the field is a Literal, ensure the value is one of the allowed options + allowed_values = typing.get_args(expected_type) + assert ( + value in allowed_values + ), f"{config_class.__name__}.{attr} has incorrect value: expected one of {allowed_values}, got {value}" + elif origin is typing.Union: + # For Union types (e.g., Optional[str]), check if value matches any type in the union + allowed_types = typing.get_args(expected_type) + assert any( + isinstance(value, t) for t in allowed_types + ), f"{config_class.__name__}.{attr} has incorrect type: expected one of {allowed_types}, got {type(value)}" + elif origin is not None: + # If it's another generic type (e.g., list[str]), check against the base type + assert ( + isinstance(value, origin) or value is None + ), f"{config_class.__name__}.{attr} has incorrect type: expected {expected_type}, got {type(value)}" + else: + # Standard type check + assert ( + isinstance(value, expected_type) or value is None + ), f"{config_class.__name__}.{attr} has incorrect type: expected {expected_type}, got {type(value)}" + + +@pytest.mark.parametrize("config_class", config_classes) +def test_config_allows_updates(config_class): + """Ensure that config values can be updated and remain type-consistent.""" + config = config_class() + + update_values = { + "lr": 0.01, + "d_model": 128, + "embedding_type": "plr", + "activation": lambda x: x, # Function update + } + + for attr, new_value in update_values.items(): + if hasattr(config, attr): + setattr(config, attr, new_value) + assert ( + getattr(config, attr) == new_value + ), f"{config_class.__name__}.{attr} did not update correctly." 
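
The config tests above (and the config refactor earlier in this series) rely on a shared BaseConfig in mambular/configs/base_config.py that the patches import but never show. As a rough sketch only, with field names and defaults inferred from the optimizer and embedding attributes removed from the per-model configs (the real class may differ):

```python
from collections.abc import Callable
from dataclasses import dataclass

import torch.nn as nn


@dataclass
class BaseConfig:
    """Assumed shape of the shared base config (illustrative, not the real source)."""

    # optimizer parameters formerly duplicated in every model config
    lr: float = 1e-04
    lr_patience: int = 10
    weight_decay: float = 1e-06
    lr_factor: float = 0.1

    # shared embedding parameters
    d_model: int = 64
    embedding_type: str = "linear"
    embedding_bias: bool = False
    embedding_activation: Callable = nn.Identity()  # noqa: RUF009
    embedding_projection: bool = True
    plr_lite: bool = False
    n_frequencies: int = 48
    frequencies_init_scale: float = 0.01
    layer_norm_after_embedding: bool = False
    layer_norm_eps: float = 1e-05
```

With a base class along these lines, the concrete configs only keep model-specific fields and overrides (e.g. DefaultSAINTConfig setting d_model = 128), which is what test_config_has_expected_attributes and test_config_default_values above verify.
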
diff --git a/tests/test_distributions.py b/tests/test_distributions.py deleted file mode 100644 index 1a8f2ca..0000000 --- a/tests/test_distributions.py +++ /dev/null @@ -1,311 +0,0 @@ -import unittest - -import torch - -from mambular.utils.distributions import ( - BetaDistribution, - CategoricalDistribution, - DirichletDistribution, - GammaDistribution, - InverseGammaDistribution, - NegativeBinomialDistribution, - NormalDistribution, - PoissonDistribution, - StudentTDistribution, -) - - -class TestNormalDistribution(unittest.TestCase): - def setUp(self): - """Initialize the NormalDistribution object with default transforms.""" - self.normal = NormalDistribution() - - def test_initialization(self): - """Test the initialization and default parameter settings.""" - self.assertEqual(self.normal._name, "Normal") - self.assertEqual(self.normal.param_names, ["mean", "variance"]) - self.assertIsInstance(self.normal.mean_transform, type(lambda x: x)) - self.assertIsInstance(self.normal.variance_transform, type(torch.nn.functional.softplus)) - - def test_predefined_transforms(self): - """Test if predefined transformations are correctly applied.""" - x = torch.tensor([-1.0, 0.0, 1.0]) - self.assertTrue(torch.allclose(self.normal.mean_transform(x), x)) # 'none' should change nothing - self.assertTrue( - torch.all(torch.ge(self.normal.variance_transform(x), 0)) - ) # 'positive' should make all values non-negative - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - predictions = torch.tensor([[0.0, 1.0]]) # mean = 0, variance = 1 - y_true = torch.tensor([0.0]) - self.normal = NormalDistribution() - loss = self.normal.compute_loss(predictions, y_true) - test_dist = torch.distributions.Normal( - loc=predictions[:, 0], scale=torch.nn.functional.softplus(predictions[:, 1]) - ) - expected_loss = -test_dist.log_prob(torch.tensor(0.0)).mean() - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - def test_evaluate_nll(self): - """Test the evaluate NLL function.""" - y_true = [0.0] - y_pred = [[0.0, 1.0]] # mean=0, variance=1 - result = self.normal.evaluate_nll(y_true, y_pred) - self.assertIn("NLL", result) - self.assertIn("mse", result) - self.assertIn("mae", result) - self.assertIn("rmse", result) - - -class TestPoissonDistribution(unittest.TestCase): - def setUp(self): - """Initialize the PoissonDistribution object with default transform.""" - self.poisson = PoissonDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.poisson._name, "Poisson") - self.assertEqual(self.poisson.param_names, ["rate"]) - self.assertIsInstance(self.poisson.rate_transform, type(torch.nn.functional.softplus)) - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - predictions = torch.tensor([[1.0]]) # rate = 1 - y_true = torch.tensor([1.0]) - loss = self.poisson.compute_loss(predictions, y_true) - expected_loss = ( - -torch.distributions.Poisson(torch.nn.functional.softplus(predictions[:, 0])) - .log_prob(torch.tensor(1.0)) - .mean() - ) - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - -class TestBetaDistribution(unittest.TestCase): - def setUp(self): - """Initialize the BetaDistribution object with default transforms.""" - self.beta = BetaDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.beta._name, "Beta") - 
self.assertEqual(self.beta.param_names, ["alpha", "beta"]) - self.assertIsInstance(self.beta.alpha_transform, type(torch.nn.functional.softplus)) - self.assertIsInstance(self.beta.beta_transform, type(torch.nn.functional.softplus)) - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - predictions = torch.tensor([[1.0, 1.0]]) # alpha = 1, beta = 1 (uniform distribution) - y_true = torch.tensor([0.5]) - loss = self.beta.compute_loss(predictions, y_true) - expected_loss = ( - -torch.distributions.Beta( - torch.nn.functional.softplus(predictions[:, 0]), - torch.nn.functional.softplus(predictions[:, 1]), - ) - .log_prob(torch.tensor(0.5)) - .mean() - ) - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - -class TestInverseGammaDistribution(unittest.TestCase): - def setUp(self): - """Initialize the InverseGammaDistribution object with default transforms.""" - self.inverse_gamma = InverseGammaDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.inverse_gamma._name, "InverseGamma") - self.assertEqual(self.inverse_gamma.param_names, ["shape", "scale"]) - self.assertIsInstance(self.inverse_gamma.shape_transform, type(torch.nn.functional.softplus)) - self.assertIsInstance(self.inverse_gamma.scale_transform, type(torch.nn.functional.softplus)) - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - # These values for shape and scale parameters are chosen to be feasible and testable. - predictions = torch.tensor([[3.0, 2.0]]) # shape = 3, scale = 2 - y_true = torch.tensor([0.5]) - - loss = self.inverse_gamma.compute_loss(predictions, y_true) - # Manually calculate the expected loss using torch's distribution functions - shape = torch.nn.functional.softplus(predictions[:, 0]) - scale = torch.nn.functional.softplus(predictions[:, 1]) - inverse_gamma_dist = torch.distributions.InverseGamma(shape, scale) - expected_loss = -inverse_gamma_dist.log_prob(y_true).mean() - - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - -class TestDirichletDistribution(unittest.TestCase): - def setUp(self): - """Initialize the DirichletDistribution object with default transforms.""" - self.dirichlet = DirichletDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.dirichlet._name, "Dirichlet") - # Concentration param_name is a simplification as mentioned in your class docstring - self.assertEqual(self.dirichlet.param_names, ["concentration"]) - self.assertIsInstance(self.dirichlet.concentration_transform, type(torch.nn.functional.softplus)) - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - # These values are chosen to be feasible and testable. 
- # Example: Concentrations for a 3-dimensional Dirichlet distribution - predictions = torch.tensor( - [[1.0, 1.0, 1.0]] - ) # Equal concentration, should resemble uniform distribution over simplex - y_true = torch.tensor([[0.33, 0.33, 0.34]]) # Example point in the probability simplex - - loss = self.dirichlet.compute_loss(predictions, y_true) - # Manually calculate the expected loss using torch's distribution functions - concentration = torch.nn.functional.softplus(predictions) - dirichlet_dist = torch.distributions.Dirichlet(concentration) - expected_loss = -dirichlet_dist.log_prob(y_true).mean() - - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - -class TestGammaDistribution(unittest.TestCase): - def setUp(self): - """Initialize the GammaDistribution object with default transforms.""" - self.gamma = GammaDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.gamma._name, "Gamma") - self.assertEqual(self.gamma.param_names, ["shape", "rate"]) - self.assertIsInstance(self.gamma.shape_transform, type(torch.nn.functional.softplus)) - self.assertIsInstance(self.gamma.rate_transform, type(torch.nn.functional.softplus)) - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - # Set some test parameters and observations - predictions = torch.tensor([[2.0, 3.0]]) # shape = 2, rate = 3 - y_true = torch.tensor([0.5]) # Test value - - loss = self.gamma.compute_loss(predictions, y_true) - # Manually calculate the expected loss using torch's distribution functions - shape = torch.nn.functional.softplus(predictions[:, 0]) - rate = torch.nn.functional.softplus(predictions[:, 1]) - gamma_dist = torch.distributions.Gamma(shape, rate) - expected_loss = -gamma_dist.log_prob(y_true).mean() - - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - -class TestStudentTDistribution(unittest.TestCase): - def setUp(self): - """Initialize the StudentTDistribution object with default transforms.""" - self.student_t = StudentTDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.student_t._name, "StudentT") - self.assertEqual(self.student_t.param_names, ["df", "loc", "scale"]) - self.assertIsInstance(self.student_t.df_transform, type(torch.nn.functional.softplus)) - self.assertIsInstance( - self.student_t.loc_transform, - type(lambda x: x), # Assuming 'none' transformation - ) - self.assertIsInstance(self.student_t.scale_transform, type(torch.nn.functional.softplus)) - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - # Set some test parameters and observations - predictions = torch.tensor([[10.0, 0.0, 1.0]]) # df=10, loc=0, scale=1 - y_true = torch.tensor([0.5]) # Test value - - loss = self.student_t.compute_loss(predictions, y_true) - # Manually calculate the expected loss using torch's distribution functions - df = torch.nn.functional.softplus(predictions[:, 0]) - loc = predictions[:, 1] # 'none' transformation - scale = torch.nn.functional.softplus(predictions[:, 2]) - student_t_dist = torch.distributions.StudentT(df, loc, scale) - expected_loss = -student_t_dist.log_prob(y_true).mean() - - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - def test_evaluate_nll(self): - """Test the evaluate NLL function and additional metrics.""" - y_true = [0.5] - y_pred = [[10.0, 0.0, 1.0]] # df=10, loc=0, scale=1 - 
result = self.student_t.evaluate_nll(y_true, y_pred) - - self.assertIn("NLL", result) - self.assertIn("mse", result) - self.assertIn("mae", result) - self.assertIn("rmse", result) - - # Check that MSE, MAE, RMSE calculations are reasonable - self.assertGreaterEqual(result["mse"], 0) - self.assertGreaterEqual(result["mae"], 0) - self.assertGreaterEqual(result["rmse"], 0) - - -class TestNegativeBinomialDistribution(unittest.TestCase): - def setUp(self): - """Initialize the NegativeBinomialDistribution object with default transforms.""" - self.negative_binomial = NegativeBinomialDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.negative_binomial._name, "NegativeBinomial") - self.assertEqual(self.negative_binomial.param_names, ["mean", "dispersion"]) - self.assertIsInstance(self.negative_binomial.mean_transform, type(torch.nn.functional.softplus)) - self.assertIsInstance( - self.negative_binomial.dispersion_transform, - type(torch.nn.functional.softplus), - ) - - def test_compute_loss_known_values(self): - """Test the loss computation against known values.""" - # Set some test parameters and observations - predictions = torch.tensor([[10.0, 0.1]]) # mean=10, dispersion=0.1 - y_true = torch.tensor([5.0]) # Test value - - loss = self.negative_binomial.compute_loss(predictions, y_true) - # Manually calculate the expected loss using torch's distribution functions - mean = torch.nn.functional.softplus(predictions[:, 0]) - dispersion = torch.nn.functional.softplus(predictions[:, 1]) - r = 1 / dispersion - p = r / (r + mean) - negative_binomial_dist = torch.distributions.NegativeBinomial(total_count=r, probs=p) - expected_loss = -negative_binomial_dist.log_prob(y_true).mean() - - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - -class TestCategoricalDistribution(unittest.TestCase): - def setUp(self): - """Initialize the CategoricalDistribution object with a probability transformation.""" - self.categorical = CategoricalDistribution() - - def test_initialization(self): - """Test the initialization and parameter settings.""" - self.assertEqual(self.categorical._name, "Categorical") - self.assertEqual(self.categorical.param_names, ["probs"]) - # The transformation function will need to ensure the probabilities are valid (non-negative and sum to 1) - # Typically, this might involve applying softmax to ensure the constraints are met. - # Here, we assume `prob_transform` is something akin to softmax for the sake of test setup. 
- self.assertIsInstance(self.categorical.probs_transform, type(torch.nn.functional.softmax)) - - def test_compute_loss_known_values(self): - # Example with three categories - logits = torch.tensor([[1.0, 2.0, 3.0], [1.0, 3.0, 4.0]]) # Logits for three categories - y_true = torch.tensor([2, 1]) - - loss = self.categorical.compute_loss(logits, y_true) - # Apply softmax to logits to convert them into probabilities - probs = torch.nn.functional.softmax(logits, dim=1) - cat_dist = torch.distributions.Categorical(probs=probs) - expected_loss = -cat_dist.log_prob(y_true).mean() - - self.assertAlmostEqual(loss.item(), expected_loss.item(), places=5) - - -# Running the tests -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_lss.py b/tests/test_lss.py deleted file mode 100644 index 01192db..0000000 --- a/tests/test_lss.py +++ /dev/null @@ -1,104 +0,0 @@ -import unittest -from unittest.mock import MagicMock, patch - -import numpy as np -import pandas as pd -from properscoring import ( - crps_gaussian, -) - -# Assuming this is the source of the CRPS function -from sklearn.metrics import mean_poisson_deviance, mean_squared_error - -from mambular.models import MambularLSS # Update the import path - - -class TestMambularLSS(unittest.TestCase): - def setUp(self): - # Patch PyTorch Lightning's Trainer and any other external dependencies - self.patcher_trainer = patch("lightning.Trainer") - self.mock_trainer = self.patcher_trainer.start() - - self.patcher_base_model = patch("mambular.base_models.distributional.BaseMambularLSS") - self.mock_base_model = self.patcher_base_model.start() - - # Initialize MambularLSS with example parameters - self.model = MambularLSS(d_model=128, dropout=0.1, n_layers=4) - - # Sample data - self.X = pd.DataFrame(np.random.randn(100, 10)) - self.y = np.random.rand(100) - - self.model.cat_feature_info = {} - self.model.num_feature_info = {} - - self.X_test = pd.DataFrame(np.random.randn(100, 10)) - self.y_test = np.random.rand(100) ** 2 - - def tearDown(self): - self.patcher_trainer.stop() - self.patcher_base_model.stop() - - def test_initialization(self): - from mambular.utils.configs import DefaultMambularConfig - - self.assertIsInstance(self.model.config, DefaultMambularConfig) - self.assertEqual(self.model.config.d_model, 128) - self.assertEqual(self.model.config.dropout, 0.1) - self.assertEqual(self.model.config.n_layers, 4) - - def test_split_data(self): - X_train, X_val, y_train, y_val = self.model.split_data(self.X, self.y, val_size=0.2, random_state=42) - self.assertEqual(len(X_train), 80) - self.assertEqual(len(X_val), 20) - self.assertEqual(len(y_train), 80) - self.assertEqual(len(y_val), 20) - - def test_fit(self): - # Mock preprocessing and model setup to focus on testing training logic - self.model.preprocess_data = MagicMock() - self.model.model = self.mock_base_model - - self.model.fit(self.X, self.y, family="normal") - - # Ensure the fit method of the trainer is called - self.mock_trainer.return_value.fit.assert_called_once() - - def test_normal_metrics(self): - # Mock predictions for the normal distribution: [mean, variance] - mock_predictions = np.column_stack((np.random.normal(size=100), np.abs(np.random.normal(size=100)))) - self.model.predict = MagicMock(return_value=mock_predictions) - - # Define custom metrics or use a function that fetches appropriate metrics - self.model.get_default_metrics = MagicMock( - return_value={ - "MSE": lambda y, pred: mean_squared_error(y, pred[:, 0]), - "CRPS": lambda y, pred: np.mean( - [crps_gaussian(y[i], 
mu=pred[i, 0], sig=np.sqrt(pred[i, 1])) for i in range(len(y))] - ), - } - ) - - results = self.model.evaluate(self.X_test, self.y_test, distribution_family="normal") - - # Validate the MSE - expected_mse = mean_squared_error(self.y_test, mock_predictions[:, 0]) - self.assertAlmostEqual(results["MSE"], expected_mse, places=4) - self.assertIn("CRPS", results) # Check for existence but not the exact value in this test - - def test_poisson_metrics(self): - # Mock predictions for Poisson - mock_predictions = np.random.poisson(lam=3, size=100) + 1e-3 - self.model.predict = MagicMock(return_value=mock_predictions) - - self.model.get_default_metrics = MagicMock(return_value={"Poisson Deviance": mean_poisson_deviance}) - - results = self.model.evaluate(self.X_test, self.y_test, distribution_family="poisson") - self.assertIn("Poisson Deviance", results) - # Optionally calculate expected deviance and check - expected_deviance = mean_poisson_deviance(self.y_test, mock_predictions) - self.assertAlmostEqual(results["Poisson Deviance"], expected_deviance) - - -if __name__ == "__main__": - unittest.main() diff --git a/tests/test_preprocessor.py b/tests/test_preprocessor.py index 68c3265..5c04630 100644 --- a/tests/test_preprocessor.py +++ b/tests/test_preprocessor.py @@ -1,82 +1,111 @@ -import unittest - +import pytest import numpy as np import pandas as pd from sklearn.exceptions import NotFittedError +from mambular.preprocessing import Preprocessor + + +@pytest.fixture +def sample_data(): + return pd.DataFrame( + { + "numerical": np.random.randn(100), + "categorical": np.random.choice(["A", "B", "C"], size=100), + "integer": np.random.randint(0, 5, size=100), + } + ) + + +@pytest.fixture +def sample_target(): + return np.random.randn(100) + + +@pytest.fixture( + params=[ + "ple", + "binning", + "one-hot", + "standardization", + "minmax", + "quantile", + "polynomial", + "robust", + "splines", + "yeo-johnson", + "box-cox", + "rbf", + "sigmoid", + "none", + ] +) +def preprocessor(request): + return Preprocessor( + numerical_preprocessing=request.param, categorical_preprocessing="one-hot" + ) + + +def test_preprocessor_initialization(preprocessor): + assert preprocessor.numerical_preprocessing in [ + "ple", + "binning", + "one-hot", + "standardization", + "minmax", + "quantile", + "polynomial", + "robust", + "splines", + "yeo-johnson", + "box-cox", + "rbf", + "sigmoid", + "none", + ] + assert preprocessor.categorical_preprocessing == "one-hot" + assert not preprocessor.fitted + + +def test_preprocessor_fit(preprocessor, sample_data, sample_target): + preprocessor.fit(sample_data, sample_target) + assert preprocessor.fitted + assert preprocessor.column_transformer is not None + + +def test_preprocessor_transform(preprocessor, sample_data, sample_target): + preprocessor.fit(sample_data, sample_target) + transformed = preprocessor.transform(sample_data) + assert isinstance(transformed, dict) + assert len(transformed) > 0 + + +def test_preprocessor_fit_transform(preprocessor, sample_data, sample_target): + transformed = preprocessor.fit_transform(sample_data, sample_target) + assert isinstance(transformed, dict) + assert len(transformed) > 0 + + +def test_preprocessor_get_params(preprocessor): + params = preprocessor.get_params() + assert "n_bins" in params + assert "numerical_preprocessing" in params + + +def test_preprocessor_set_params(preprocessor): + preprocessor.set_params(n_bins=128) + assert preprocessor.n_bins == 128 + + +def test_transform_before_fit_raises_error(preprocessor, sample_data): + with 
pytest.raises(NotFittedError): + preprocessor.transform(sample_data) + -from mambular.utils.preprocessor import Preprocessor - - -class TestPreprocessor(unittest.TestCase): - def setUp(self): - # Sample data for testing - self.data = pd.DataFrame( - { - "numerical": np.random.randn(500), - "categorical": np.random.choice(["A", "B", "C"], size=500), - "mixed": np.random.choice([1, "A", "B"], size=500), - } - ) - self.target = np.random.randn(500) - - def test_initialization(self): - """Test initialization of the Preprocessor with default parameters.""" - pp = Preprocessor(n_bins=20, numerical_preprocessing="binning") - self.assertEqual(pp.n_bins, 20) - self.assertEqual(pp.numerical_preprocessing, "binning") - self.assertFalse(pp.use_decision_tree_bins) - - def test_fit(self): - """Test the fitting process of the preprocessor.""" - pp = Preprocessor(numerical_preprocessing="binning", n_bins=20) - pp.fit(self.data, self.target) - self.assertIsNotNone(pp.column_transformer) - - def test_transform_not_fitted(self): - """Test that transform raises an error if called before fitting.""" - pp = Preprocessor() - with self.assertRaises(NotFittedError): - pp.transform(self.data) - - def test_fit_transform(self): - """Test fitting and transforming the data.""" - pp = Preprocessor(numerical_preprocessing="standardization") - transformed_data = pp.fit_transform(self.data, self.target) - self.assertIsInstance(transformed_data, dict) - self.assertTrue("num_numerical" in transformed_data) - self.assertTrue("cat_categorical" in transformed_data) - - def test_ple(self): - """Test fitting and transforming the data.""" - pp = Preprocessor(numerical_preprocessing="ple", n_bins=20) - transformed_data = pp.fit_transform(self.data, self.target) - self.assertIsInstance(transformed_data, dict) - self.assertTrue("num_numerical" in transformed_data) - self.assertTrue("cat_categorical" in transformed_data) - - def test_transform_with_missing_values(self): - """Ensure the preprocessor can handle missing values.""" - data_with_missing = self.data.copy() - data_with_missing.loc[0, "numerical"] = np.nan - data_with_missing.loc[1, "categorical"] = np.nan - pp = Preprocessor(numerical_preprocessing="normalization") - transformed_data = pp.fit_transform(data_with_missing, self.target) - self.assertNotIn(np.nan, transformed_data["num_numerical"]) - self.assertNotIn(np.nan, transformed_data["cat_categorical"]) - - def test_decision_tree_bins(self): - """Test the usage of decision tree for binning.""" - pp = Preprocessor(use_decision_tree_bins=True, numerical_preprocessing="binning", n_bins=5) - pp.fit(self.data, self.target) - # Checking if the preprocessor setup decision tree bins properly - self.assertTrue( - all( - isinstance(x, np.ndarray) - for x in pp._get_decision_tree_bins(self.data[["numerical"]], self.target, ["numerical"]) - ) - ) - - -# Running the tests -if __name__ == "__main__": - unittest.main() +def test_get_feature_info(preprocessor, sample_data, sample_target): + preprocessor.fit(sample_data, sample_target) + numerical_info, categorical_info, embedding_info = preprocessor.get_feature_info( + verbose=False + ) + assert isinstance(numerical_info, dict) + assert isinstance(categorical_info, dict) + assert isinstance(embedding_info, dict) diff --git a/tests/test_regressor.py b/tests/test_regressor.py deleted file mode 100644 index 8626039..0000000 --- a/tests/test_regressor.py +++ /dev/null @@ -1,103 +0,0 @@ -import unittest -from unittest.mock import MagicMock, patch - -import numpy as np -import pandas as pd 
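Aside (editorial, not part of the patch series): tests/test_regressor.py is deleted here without a pytest counterpart. A minimal pytest-style replacement in the spirit of the reworked tests/test_preprocessor.py might look like the sketch below; it assumes MambularRegressor still exposes its settings via model.config (d_model, dropout), as the deleted unittest version asserts, and the parametrized values are illustrative only.

import pytest

from mambular.models import MambularRegressor


@pytest.mark.parametrize("d_model,dropout", [(64, 0.1), (128, 0.2)])
def test_regressor_config_roundtrip(d_model, dropout):
    # Assumed behaviour: constructor kwargs land on model.config,
    # mirroring what the removed unittest version checked.
    model = MambularRegressor(d_model=d_model, dropout=dropout)
    assert model.config.d_model == d_model
    assert model.config.dropout == dropout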
-from sklearn.metrics import mean_squared_error, r2_score - -from mambular.models import MambularRegressor # Ensure correct import path - - -class TestMambularRegressor(unittest.TestCase): - def setUp(self): - # Patching external dependencies - self.patcher_pl_trainer = patch("lightning.Trainer") - self.mock_pl_trainer = self.patcher_pl_trainer.start() - - self.patcher_base_model = patch("mambular.base_models.regressor.BaseMambularRegressor") - self.mock_base_model = self.patcher_base_model.start() - - self.regressor = MambularRegressor(d_model=128, dropout=0.1) - - # Sample data - self.X = pd.DataFrame(np.random.randn(100, 10)) - self.y = np.random.rand(100) - - self.regressor.cat_feature_info = {} - self.regressor.num_feature_info = {} - - def tearDown(self): - self.patcher_pl_trainer.stop() - self.patcher_base_model.stop() - - def test_initialization(self): - # This assumes MambularConfig is properly imported and used in the MambularRegressor class - from mambular.utils.configs import DefaultMambularConfig - - self.assertIsInstance(self.regressor.config, DefaultMambularConfig) - self.assertEqual(self.regressor.config.d_model, 128) - self.assertEqual(self.regressor.config.dropout, 0.1) - - def test_split_data(self): - """Test the data splitting functionality.""" - X_train, X_val, y_train, y_val = self.regressor.split_data(self.X, self.y, val_size=0.2, random_state=42) - self.assertEqual(len(X_train), 80) - self.assertEqual(len(X_val), 20) - self.assertEqual(len(y_train), 80) - self.assertEqual(len(y_val), 20) - - def test_fit(self): - """Test the training setup and call.""" - # Mock the necessary parts to simulate training - self.regressor.preprocess_data = MagicMock() - self.regressor.model = self.mock_base_model - - self.regressor.fit(self.X, self.y) - - # Ensure that the fit method of the trainer is called - self.mock_pl_trainer.return_value.fit.assert_called_once() - - def test_predict(self): - # Create mock return objects that mimic tensor behavior - mock_prediction = MagicMock() - mock_prediction.cpu.return_value = MagicMock() - mock_prediction.cpu.return_value.numpy.return_value = np.array([0.5] * 100) - - # Mock the model and its method calls - self.regressor.model = MagicMock() - self.regressor.model.eval.return_value = None - self.regressor.model.return_value = mock_prediction - - # Mock preprocess_test_data to return dummy tensor data - self.regressor.preprocess_test_data = MagicMock(return_value=([], [])) - - predictions = self.regressor.predict(self.X) - - # Assert that predictions return as expected - np.testing.assert_array_equal(predictions, np.array([0.5] * 100)) - - def test_evaluate(self): - # Mock the predict method to simulate regressor output - mock_predictions = np.random.rand(100) - self.regressor.predict = MagicMock(return_value=mock_predictions) - - # Define metrics to test - metrics = {"Mean Squared Error": mean_squared_error, "R2 Score": r2_score} - - # Call evaluate with the defined metrics - result = self.regressor.evaluate(self.X, self.y, metrics=metrics) - - # Compute expected metrics directly - expected_mse = mean_squared_error(self.y, mock_predictions) - expected_r2 = r2_score(self.y, mock_predictions) - - # Check the results of evaluate - self.assertAlmostEqual(result["Mean Squared Error"], expected_mse) - self.assertAlmostEqual(result["R2 Score"], expected_r2) - - # Ensure predict was called correctly - self.regressor.predict.assert_called_once_with(self.X) - - -if __name__ == "__main__": - unittest.main() From 161f6de0c1eb21a7f5575471f376d777a353443e 
Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:14:42 +0100 Subject: [PATCH 18/24] remove dependence on rotary embeddings --- mambular/arch_utils/layer_utils/attention_utils.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/mambular/arch_utils/layer_utils/attention_utils.py b/mambular/arch_utils/layer_utils/attention_utils.py index bdfed31..1b50d72 100644 --- a/mambular/arch_utils/layer_utils/attention_utils.py +++ b/mambular/arch_utils/layer_utils/attention_utils.py @@ -5,7 +5,6 @@ import torch.nn as nn import torch.nn.functional as F from einops import rearrange -from rotary_embedding_torch import RotaryEmbedding class GEGLU(nn.Module): @@ -25,7 +24,7 @@ def FeedForward(dim, mult=4, dropout=0.0): class Attention(nn.Module): - def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary=False): + def __init__(self, dim, heads=8, dim_head=64, dropout=0.0): super().__init__() inner_dim = dim_head * heads self.heads = heads @@ -34,18 +33,13 @@ def __init__(self, dim, heads=8, dim_head=64, dropout=0.0, rotary=False): self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False) self.to_out = nn.Linear(inner_dim, dim, bias=False) self.dropout = nn.Dropout(dropout) - self.rotary = rotary dim = np.int64(dim / 2) - self.rotary_embedding = RotaryEmbedding(dim=dim) def forward(self, x): h = self.heads x = self.norm(x) q, k, v = self.to_qkv(x).chunk(3, dim=-1) q, k, v = map(lambda t: rearrange(t, "b n (h d) -> b h n d", h=h), (q, k, v)) # type: ignore - if self.rotary: - q = self.rotary_embedding.rotate_queries_or_keys(q) - k = self.rotary_embedding.rotate_queries_or_keys(k) q = q * self.scale sim = torch.einsum("b h i d, b h j d -> b h i j", q, k) @@ -61,7 +55,7 @@ def forward(self, x): class Transformer(nn.Module): - def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout, rotary=False): + def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout): super().__init__() self.layers = nn.ModuleList([]) @@ -74,7 +68,6 @@ def __init__(self, dim, depth, heads, dim_head, attn_dropout, ff_dropout, rotary heads=heads, dim_head=dim_head, dropout=attn_dropout, - rotary=rotary, ), FeedForward(dim, dropout=ff_dropout), ] From bd998d3580160dff5b6c681755df27b317ec0992 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:18:08 +0100 Subject: [PATCH 19/24] include params related to [BUG] Missing Configuration Attributes in DefaultMambularConfig #209 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- mambular/configs/mambular_config.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mambular/configs/mambular_config.py b/mambular/configs/mambular_config.py index 8bc2f90..a60b54e 100644 --- a/mambular/configs/mambular_config.py +++ b/mambular/configs/mambular_config.py @@ -64,6 +64,12 @@ class DefaultMambularConfig(BaseConfig): Whether to use PSCAN for the state-space model. mamba_version : str, default="mamba-torch" Version of the Mamba model to use ('mamba-torch', 'mamba1', 'mamba2').
+ conv_bias : bool, default=False + Whether to use a bias in the 1D convolution before each Mamba block. + AD_weight_decay : bool, default=True + Whether to use weight decay also for the A and D matrices in Mamba. + BC_layer_norm : bool, default=False + Whether to use layer norm on the B and C matrices. """ # Architecture Parameters @@ -82,6 +88,9 @@ class DefaultMambularConfig(BaseConfig): dt_init_floor: float = 1e-04 norm: str = "RMSNorm" activation: Callable = nn.SiLU() # noqa: RUF009 + conv_bias: bool = False + AD_weight_decay: bool = True + BC_layer_norm: bool = False # Embedding Parameters shuffle_embeddings: bool = False From 44d3b3a69b722c0a2f3361968d957749d3b220da Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:32:26 +0100 Subject: [PATCH 20/24] test new unit test workflow for pull requests --- .github/workflows/pr-tests.yml | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) create mode 100644 .github/workflows/pr-tests.yml diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml new file mode 100644 index 0000000..4e22604 --- /dev/null +++ b/.github/workflows/pr-tests.yml @@ -0,0 +1,37 @@ +name: PR Unit Tests + +on: + pull_request: + branches: + - develop + - master # Add any other branches where you want to enforce tests + +jobs: + test: + runs-on: ubuntu-latest + + steps: + - name: Checkout Repository + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.12" # Change this to match your setup + + - name: Install Dependencies + run: | + python -m pip install --upgrade pip + poetry install + pip install pytest + + - name: Run Unit Tests + run: pytest tests/ + + - name: Verify Tests Passed + if: ${{ success() }} + run: echo "All tests passed! Pull request is allowed."
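# Aside (editorial comment, not part of the workflow file added by this patch):
# pytest returning a non-zero exit code already fails the "Run Unit Tests" step
# and hence the job, so the two status steps below are informational only.
# Actually blocking a merge on this job additionally requires marking the check
# as required in the repository's branch protection settings.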
+ + - name: Fail PR on Test Failure + if: ${{ failure() }} + run: exit 1 # This ensures the PR cannot be merged if tests fail From 5fc2ed71b31ce9e978cc0bee3491c33042084d3b Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:35:02 +0100 Subject: [PATCH 21/24] change py-version --- .github/workflows/pr-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index 4e22604..2e65f57 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.12" # Change this to match your setup + python-version: "3.8" # Change this to match your setup - name: Install Dependencies run: | From e722767b3e005450103cc5f974be6beea81b6df5 Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:36:53 +0100 Subject: [PATCH 22/24] adapt test to .py version 3.10 --- .github/workflows/pr-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index 2e65f57..fd31668 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v4 with: - python-version: "3.8" # Change this to match your setup + python-version: "3.10" # Change this to match your setup - name: Install Dependencies run: | From 1fcb03023378e362c84d841b8d385f67d986ca6b Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:39:14 +0100 Subject: [PATCH 23/24] install poetry in workflow --- .github/workflows/pr-tests.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index fd31668..ec85672 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -18,6 +18,11 @@ jobs: uses: actions/setup-python@v4 with: python-version: "3.10" # Change this to match your setup + + - name: Install Poetry + run: | + curl -sSL https://install.python-poetry.org | python3 - + export PATH="$HOME/.local/bin:$PATH" - name: Install Dependencies run: | From ac27a1da8a0a2299ee21344facee6b007f3dbfcd Mon Sep 17 00:00:00 2001 From: AnFreTh Date: Wed, 12 Feb 2025 14:43:56 +0100 Subject: [PATCH 24/24] ensure mambular is locally installed --- .github/workflows/pr-tests.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/pr-tests.yml b/.github/workflows/pr-tests.yml index ec85672..0e878fe 100644 --- a/.github/workflows/pr-tests.yml +++ b/.github/workflows/pr-tests.yml @@ -22,6 +22,7 @@ jobs: - name: Install Poetry run: | curl -sSL https://install.python-poetry.org | python3 - + echo "$HOME/.local/bin" >> $GITHUB_PATH export PATH="$HOME/.local/bin:$PATH" - name: Install Dependencies @@ -30,7 +31,14 @@ jobs: poetry install pip install pytest + - name: Install Package Locally + run: | + poetry build + pip install dist/*.whl # Install the built package to fix "No module named 'mambular'" + - name: Run Unit Tests + env: + PYTHONPATH: ${{ github.workspace }} # Ensure the package is discoverable run: pytest tests/ - name: Verify Tests Passed