From 59e96c59d53cef18eba279f9de297d16609ff889 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Mon, 26 Feb 2024 21:16:50 +0100 Subject: [PATCH] DFTNormalisation: * Improve docstrings/comments * Improve type hints * Add parameters 'array_valued' and 'fit' to RuleTemplate --- src/sensai/data_transformation/dft.py | 133 +++++++++++++-------- src/sensai/featuregen/feature_generator.py | 2 +- 2 files changed, 86 insertions(+), 49 deletions(-) diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index 1150fff3..f5eebdb7 100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -479,7 +479,7 @@ def _apply(self, df: pd.DataFrame) -> pd.DataFrame: if self.keep is not None: df = df.loc[self.keep] if self.drop is not None: - df = df.drop(self.drop) + df = df.drop(self.drop) # type: ignore return df @@ -493,47 +493,59 @@ class DFTNormalisation(DataFrameTransformer): class RuleTemplate: def __init__(self, - skip=False, - unsupported=False, + skip: bool = False, + unsupported: bool = False, transformer: Optional[SkLearnTransformerProtocol] = None, transformer_factory: Callable[[], SkLearnTransformerProtocol] = None, - independent_columns: Optional[bool] = None): + independent_columns: Optional[bool] = None, + array_valued: bool = False, + fit: bool = True): """ - Creates a rule template which applies to one or more features/columns (depending on context). - Use parameters as follows: + A template from which a rule which matches multiple columns can be created. + This is useful for the generation of rules which shall apply to all the (numerical) columns generated + by a :class:`FeatureGenerator` without specifically naming them. - * If the relevant features are already normalised, pass ``skip=True`` - * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` - * If the relevant features shall be normalised, the other parameters apply. 
- No parameters, i.e. ``RuleTemplate()``, are an option if ... + Use the parameters as follows: - * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application - is suitable for the relevant set of features. - Otherwise, specify either ``transformer_factory`` or ``transformer``. - * the resulting rule will match only a single column. Otherwise, ``independent_columns`` - must be specified to True or False. + * If the relevant features are already normalised, pass ``skip=True`` + * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` + * If the relevant features shall be normalised, the other parameters apply. + No parameters, i.e. ``RuleTemplate()``, are an option if ... - :param skip: flag indicating whether no transformation shall be performed on matched columns (because they are already - normalised) + * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application + is suitable for the relevant set of features. + Otherwise, specify either ``transformer_factory`` or ``transformer``. + * the resulting rule will match only a single column. Otherwise, ``independent_columns`` + must be specified to True or False. + + :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already + normalised). :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if - attempted) + attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making + their way into the final dataframe that will be normalised and used for training a model. :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. 
StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is fitted. NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the - same transformer instance to be used in multiple places - e.g., sharing it across several models that use the same + same transformer instance to be used in multiple places - e.g. sharing it across several models that use the same column with associated rule/rule template (disabling `fit` where appropriate). :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. - :param independent_columns: only relevant if the resulting rule matches multiple columns, in which case it is required. - If True, the columns are treated independent and a separate transformation is to be learned for each of them. Note that - this doesn't mean each column will get a separate transformer instance! Rather, the transformer will be fitted - on the array resulting from selecting the matched columns. - If False, all matching columns are treated as a single feature for the purpose of normalisation. - Thus, all columns will be concatenated before fitting the transformer. + :param array_valued: only allowed if the rule matches a single column. If True, it expresses that + column values are not scalars but arrays (of arbitrary lengths). + It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same + transformation will be applied to each entry in the array. 
+ :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is + if a transformer instance instead of a factory is given and the transformer is already fitted. + :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a + separate transformation is to be learned for each of them (rather than using the same transformation for all columns and + learning the transformation from the data of all columns). + This parameter must be specified for rules matching more than one column, + None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect. """ + # NOTE: keep in sync with Rule! if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") self.skip = skip @@ -541,8 +553,10 @@ def __init__(self, self.transformer = transformer self.transformerFactory = transformer_factory self.independentColumns = independent_columns + self.arrayValued = array_valued + self.fit = fit - def to_rule(self, regex: Optional[str]): + def to_rule(self, regex: Optional[Union[str, re.Pattern]]): """ Convert the template to a rule for all columns matching the regex @@ -550,14 +564,14 @@ def to_rule(self, regex: Optional[str]): :return: the resulting Rule """ return DFTNormalisation.Rule(regex, skip=self.skip, unsupported=self.unsupported, transformer=self.transformer, - transformer_factory=self.transformerFactory, independent_columns=self.independentColumns) + transformer_factory=self.transformerFactory, independent_columns=self.independentColumns, array_valued=self.arrayValued, fit=self.fit) def to_placeholder_rule(self): return self.to_rule(None) class Rule(ToStringMixin): def __init__(self, - regex: Optional[str], + regex: Optional[Union[str, re.Pattern]], skip: bool = False, unsupported: bool = False, transformer: 
Optional[SkLearnTransformerProtocol] = None, @@ -566,44 +580,63 @@ def __init__(self, fit: bool = True, independent_columns: Optional[bool] = None): """ + Use the parameters as follows: + + * If the relevant features are already normalised, pass ``skip=True`` + * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` + * If the relevant features shall be normalised, the other parameters apply. + No parameters other than regex, i.e. ``Rule(regex)``, are an option if ... + + * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application + is suitable for the relevant set of features. + Otherwise, specify either ``transformer_factory`` or ``transformer``. + * the resulting rule will match only a single column. Otherwise, ``independent_columns`` + must be specified to True or False. + :param regex: a regular expression defining the column(s) the rule applies to. - If it applies to multiple columns, these columns will be normalised in the same way (using the same normalisation - process for each column) unless independentColumns=True. - If None, the rule is a placeholder rule and the regex must be set later via setRegex or the rule will not be applicable. - :param skip: flag indicating whether no transformation shall be performed on the matching column(s) - :param unsupported: flag indicating whether normalisation of the matching column(s) is unsupported (shall trigger an exception - if attempted) + If it matches multiple columns, these columns will be normalised in the same way (using the same normalisation + process for each column) unless independent_columns=True. + If None, the rule is a placeholder rule and the regex must be set later via set_regex or the rule will not be applicable. + :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already + normalised). 
+ :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if + attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making + their way into the final dataframe that will be normalised and used for training a model. :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is fitted. NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the - same transformer instance to be used in multiple places - e.g., sharing it across several models that use the same + same transformer instance to be used in multiple places - e.g. sharing it across several models that use the same column with associated rule/rule template (disabling `fit` where appropriate). :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given. If neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default + `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. - :param array_valued: whether the column values are not scalars but arrays (of arbitrary lengths). - It is assumed that all entries in such arrays are to be normalised in the same way. - If arrayValued is True, only a single matching column is supported, i.e. the regex must match at most one column. 
- :param fit: whether the rule's transformer shall be fitted - :param independent_columns: only relevant if the rule matches multiple columns, in which case it is required. - If True, the columns are treated independent and a separate transformation is to be learned for each of them. Note that - this doesn't mean each column will get a separate transformer instance! Rather, the transformer will be fitted - on the array resulting from selecting the matched columns. - If False, all matching columns are treated as a single feature for the purpose of normalisation. - Thus, all columns will be concatenated before fitting the transformer. + :param array_valued: only allowed if the rule matches a single column. If True, it expresses that + column values are not scalars but arrays (of arbitrary lengths). + It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same + transformation will be applied to each entry in the array. + :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is + if a transformer instance instead of a factory is given and the transformer is already fitted. + :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a + separate transformation is to be learned for each of them (rather than using the same transformation for all columns and + learning the transformation from the data of all columns). + This parameter must be specified for rules matching more than one column, + None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect. 
""" if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") - self.regex = re.compile(regex) if regex is not None else None + if isinstance(regex, str): + regex = re.compile(regex) + self.regex = regex + # NOTE: keep in sync with RuleTemplate! self.skip = skip self.unsupported = unsupported self.transformer = transformer self.transformerFactory = transformer_factory self.arrayValued = array_valued self.fit = fit - self.independentColumns = independent_columns + self.independentColumns = independent_columns def __setstate__(self, state): setstate(DFTNormalisation.Rule, self, state, new_default_properties=dict(arrayValued=False, fit=True, independentColumns=False, @@ -668,6 +701,10 @@ def _tostring_additional_entries(self) -> Dict[str, Any]: def _fit(self, df: pd.DataFrame): matched_rules_by_column = {} self._rules = [] + # For rules matching multiple columns, if independent_columns is False, the columns + # will be concatenated and treated as a single column for fitting the transformer. + # Note that transformers follow sklearn interfaces, thus just passing an array + # to them will learn a per-column-transformation. This will be the case for independent_columns=True. 
for rule in self._userRules: matching_columns = rule.matching_columns(df.columns) for c in matching_columns: diff --git a/src/sensai/featuregen/feature_generator.py b/src/sensai/featuregen/feature_generator.py index 0d462dfa..bde0ff66 100644 --- a/src/sensai/featuregen/feature_generator.py +++ b/src/sensai/featuregen/feature_generator.py @@ -36,7 +36,7 @@ def __init__(self, add_categorical_default_rules: bool = True): """ :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names - (which must not only work for the feature generated by this feature generator, i.e., it should not match feature names generated + (which must only work for the features generated by this feature generator, i.e. it should not match feature names generated by other feature generators). It will be ensured that the respective columns in the generated data frames will have dtype 'category'. Furthermore, the presence of meta-information can later be leveraged for further transformations, e.g., one-hot encoding.