opcode81 · MischaPanch · Feb 29, 2024 · Feb 21, 2024 · Feb 21, 2024 · Feb 26, 2024
diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py
@@ -479,7 +479,7 @@ def _apply(self, df: pd.DataFrame) -> pd.DataFrame:
         if self.keep is not None:
             df = df.loc[self.keep]
         if self.drop is not None:
-            df = df.drop(self.drop)
+            df = df.drop(self.drop)  # type: ignore
         return df
 
 
@@ -493,101 +493,143 @@ class DFTNormalisation(DataFrameTransformer):
 
     class RuleTemplate:
         def __init__(self,
-                skip=False,
-                unsupported=False,
+                skip: bool = False,
+                unsupported: bool = False,
                 transformer: Optional[SkLearnTransformerProtocol] = None,
                 transformer_factory: Callable[[], SkLearnTransformerProtocol] = None,
-                independent_columns: Optional[bool] = None):
+                independent_columns: Optional[bool] = None,
+                array_valued: bool = False,
+                fit: bool = True):
             """
-            Creates a rule template which applies to one or more features/columns (depending on context).
-            Use parameters as follows:
-
-                * If the relevant features are already normalised, pass ``skip=True``
-                * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True``
-                * If the relevant features shall be normalised, the other parameters apply.
-                  No parameters, i.e. ``RuleTemplate()``, are an option if ...
-
-                    * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application
-                      is suitable for the relevant set of features.
-                      Otherwise, specify either ``transformerFactory`` or ``transformer``.
-                    * all relevant features are to be normalised in the same way.
-                      Otherwise, specify ``independentColumns=True``.
-
-            :param skip: flag indicating whether no transformation shall be performed on all of the columns (because they are already
-                normalised)
-            :param unsupported: flag indicating whether normalisation of all columns is unsupported (shall trigger an exception if
-                attempted)
-            :param transformer: a transformer instance (from sklearn.preprocessing, e.g. StandardScaler) to apply to the matching column(s)
+            A template from which a rule which matches multiple columns can be created.
+            This is useful for the generation of rules which shall apply to all the (numerical) columns generated
+            by a :class:`FeatureGenerator` without specifically naming them.
+
+            Use the parameters as follows:
+
+            * If the relevant features are already normalised, pass ``skip=True``
+            * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True``
+            * If the relevant features shall be normalised, the other parameters apply.
+              No parameters, i.e. ``RuleTemplate()``, are an option if ...
+
+                * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application
+                  is suitable for the relevant set of features.
+                  Otherwise, specify either ``transformer_factory`` or ``transformer``.
+                * the resulting rule will match only a single column. Otherwise, ``independent_columns``
+                  must be explicitly set to True or False.
+
+            :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already
+                normalised).
+            :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if
+                attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making
+                their way into the final dataframe that will be normalised and used for training a model.
+            :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s)
                 for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either
-                transformerFactory or the containing instance's default factory will be used.
-                NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same
-                feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory.
+                transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is
+                fitted.
+                NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the
+                same transformer instance to be used in multiple places - e.g. sharing it across several models that use the same
+                column with associated rule/rule template (disabling `fit` where appropriate).
             :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if
-                `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default
+                `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default
                 factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options.
+            :param array_valued: only allowed if the rule matches a single column. If True, it expresses that
+                column values are not scalars but arrays (of arbitrary lengths).
+                It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same
+                transformation will be applied to each entry in the array.
+            :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is
+                if a transformer instance instead of a factory is given and the transformer is already fitted.
             :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a
                 separate transformation is to be learned for each of them (rather than using the same transformation for all columns and
-                learning the transformation from the data of all columns); must be specified for rules matching more than one column,
-                None is acceptable only for a single column
+                learning the transformation from the data of all columns).
+                This parameter must be specified (True or False) for rules matching more than one column.
+                None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect.
             """
+            # NOTE: keep in sync with Rule!
             if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0:
-                raise ValueError("Passed transformer or transformerFactory while skip=True or unsupported=True")
+                raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True")
             self.skip = skip
             self.unsupported = unsupported
             self.transformer = transformer
             self.transformerFactory = transformer_factory
             self.independentColumns = independent_columns
+            self.arrayValued = array_valued
+            self.fit = fit
 
-        def to_rule(self, regex: Optional[str]):
+        def to_rule(self, regex: Optional[Union[str, re.Pattern]]):
             """
             Convert the template to a rule for all columns matching the regex
 
             :param regex: a regular expression defining the column the rule applies to
             :return: the resulting Rule
             """
             return DFTNormalisation.Rule(regex, skip=self.skip, unsupported=self.unsupported, transformer=self.transformer,
-                transformer_factory=self.transformerFactory, independent_columns=self.independentColumns)
+                transformer_factory=self.transformerFactory, independent_columns=self.independentColumns, array_valued=self.arrayValued, fit=self.fit)
 
         def to_placeholder_rule(self):
             return self.to_rule(None)
 
     class Rule(ToStringMixin):
         def __init__(self,
-                regex: Optional[str],
-                skip=False, unsupported=False,
-                transformer: SkLearnTransformerProtocol = None,
-                transformer_factory: Callable[[], SkLearnTransformerProtocol] = None,
-                array_valued=False,
-                fit=True,
+                regex: Optional[Union[str, re.Pattern]],
+                skip: bool = False,
+                unsupported: bool = False,
+                transformer: Optional[SkLearnTransformerProtocol] = None,
+                transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None,
+                array_valued: bool = False,
+                fit: bool = True,
                 independent_columns: Optional[bool] = None):
             """
+            Use the parameters as follows:
+
+                * If the relevant features are already normalised, pass ``skip=True``
+                * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True``
+                * If the relevant features shall be normalised, the other parameters apply.
+                  No parameters other than regex, i.e. ``Rule(regex)``, are an option if ...
+
+                    * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application
+                      is suitable for the relevant set of features.
+                      Otherwise, specify either ``transformer_factory`` or ``transformer``.
+                    * the resulting rule will match only a single column. Otherwise, ``independent_columns``
+                      must be explicitly set to True or False.
+
             :param regex: a regular expression defining the column(s) the rule applies to.
-                If it applies to multiple columns, these columns will be normalised in the same way (using the same normalisation
-                process for each column) unless independentColumns=True.
-                If None, the rule is a placeholder rule and the regex must be set later via setRegex or the rule will not be applicable.
-            :param skip: flag indicating whether no transformation shall be performed on the matching column(s)
-            :param unsupported: flag indicating whether normalisation of the matching column(s) is unsupported (shall trigger an exception
-                if attempted)
-            :param transformer: a transformer instance (from sklearn.preprocessing, e.g. StandardScaler) to apply to the matching column(s)
+                If it matches multiple columns, these columns will be normalised in the same way (using the same normalisation
+                process for each column) unless independent_columns=True.
+                If None, the rule is a placeholder rule and the regex must be set later via set_regex or the rule will not be applicable.
+            :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already
+                normalised).
+            :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if
+                attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making
+                their way into the final dataframe that will be normalised and used for training a model.
+            :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s)
                 for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either
-                transformerFactory or the containing instance's default factory will be used.
-                NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same
-                feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory.
+                transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is
+                fitted.
+                NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the
+                same transformer instance to be used in multiple places - e.g. sharing it across several models that use the same
+                column with associated rule/rule template (disabling `fit` where appropriate).
             :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if
-                `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default
+                `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default
                 factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options.
-            :param array_valued: whether the column values are not scalars but arrays (of arbitrary lengths).
-                It is assumed that all entries in such arrays are to be normalised in the same way.
-                If arrayValued is True, only a single matching column is supported, i.e. the regex must match at most one column.
-            :param fit: whether the rule's transformer shall be fitted
+            :param array_valued: only allowed if the rule matches a single column. If True, it expresses that
+                column values are not scalars but arrays (of arbitrary lengths).
+                It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same
+                transformation will be applied to each entry in the array.
+            :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is
+                if a transformer instance instead of a factory is given and the transformer is already fitted.
             :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a
                 separate transformation is to be learned for each of them (rather than using the same transformation for all columns and
-                learning the transformation from the data of all columns); must be specified for rules matching more than one column, None
-                is acceptable only for single-column rules
+                learning the transformation from the data of all columns).
+                This parameter must be specified (True or False) for rules matching more than one column.
+                None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect.
             """
-            if skip and (transformer is not None or transformer_factory is not None):
-                raise ValueError("skip==True while transformer/transformerFactory is not None")
-            self.regex = re.compile(regex) if regex is not None else None
+            if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0:
+                raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True")
+            if isinstance(regex, str):
+                regex = re.compile(regex)
+            self.regex = regex
+            # NOTE: keep in sync with RuleTemplate!
             self.skip = skip
             self.unsupported = unsupported
             self.transformer = transformer
@@ -624,18 +666,21 @@ def matches(self, column: str):
         def matching_columns(self, columns: Sequence[str]) -> List[str]:
             return [col for col in columns if self.matches(col)]
 
-    def __init__(self, rules: Sequence[Rule], default_transformer_factory=None, require_all_handled=True, inplace=False):
+    def __init__(self, rules: Sequence[Rule], default_transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None,
+            require_all_handled: bool = True, inplace: bool = False):
         """
-        :param rules: the set of rules; rules are always fitted and applied in the given order.
+        :param rules: the set of rules; rules (i.e., their transformers) are always fitted and applied in the given order.
             A convenient way to obtain a set of rules in the :class:`sensai.vector_model.VectorModel` context is from a
             :class:`sensai.featuregen.FeatureCollector` or :class:`sensai.featuregen.MultiFeatureGenerator`.
+            It is often a good idea to associate rules (or a rule template) with a feature generator;
+            the rules can then be obtained from it using `get_normalisation_rules`.
         :param default_transformer_factory: a factory for the creation of transformer instances (which implements the
             API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all
             rules that do not specify a particular transformer.
             The default transformer will only be applied to columns matched by such rules, unmatched columns will
             not be transformed.
-            Use SkLearnTransformerFactoryFactory to conveniently create a factory.
-        :param require_all_handled: whether to raise an exception if not all columns are matched by a rule
+            Use :class:`SkLearnTransformerFactoryFactory` to conveniently create a factory.
+        :param require_all_handled: whether to raise an exception if any column is not matched by a rule
         :param inplace: whether to apply data frame transformations in-place
         """
         super().__init__()
@@ -656,6 +701,10 @@ def _tostring_additional_entries(self) -> Dict[str, Any]:
     def _fit(self, df: pd.DataFrame):
         matched_rules_by_column = {}
         self._rules = []
+        # For rules matching multiple columns with independent_columns=False, the columns
+        # are concatenated and treated as a single column when fitting the transformer.
+        # With independent_columns=True, the multi-column array is passed to the transformer as is;
+        # since transformers follow the sklearn interface, a separate transformation is then learned per column.
         for rule in self._userRules:
             matching_columns = rule.matching_columns(df.columns)
             for c in matching_columns:

diff --git a/src/sensai/data_transformation/sklearn_transformer.py b/src/sensai/data_transformation/sklearn_transformer.py
@@ -30,7 +30,7 @@ def fit(self, arr: TransformableArray):
         pass
 
 
-class ManualScaler:
+class ManualScaler(SkLearnTransformerProtocol):
     """
     A scaler whose parameters are not learnt from data but manually defined
     """

diff --git a/src/sensai/evaluation/evaluator.py b/src/sensai/evaluation/evaluator.py
@@ -266,7 +266,7 @@ def __init__(self,
             fractional split of the data
         :param fractional_split_shuffle: [if dataSplitter is None, test data must be obtained via split] whether to randomly (based on
             randomSeed) shuffle the dataset before splitting it
-
+        :param metrics: regression metrics to apply. If None, default regression metrics are used.
         :param additional_metrics: additional regression metrics to apply
         :param output_data_frame_transformer: a data frame transformer to apply to all output data frames (both model outputs and ground
             truth), such that evaluation metrics are computed on the transformed data frame