From ab40f2b4e977b286cd93b553fbc0a09580d515c3 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 21 Feb 2024 19:51:11 +0100 Subject: [PATCH 1/6] Minor, docstrings and types --- src/sensai/data_transformation/dft.py | 40 +++++++++++-------- src/sensai/evaluation/evaluator.py | 2 +- src/sensai/featuregen/feature_generator.py | 28 +++++++------ .../featuregen/feature_generator_registry.py | 4 +- src/sensai/vector_model.py | 2 +- 5 files changed, 42 insertions(+), 34 deletions(-) diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index cfa75f80..4a744ba8 100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -509,9 +509,9 @@ def __init__(self, * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application is suitable for the relevant set of features. - Otherwise, specify either ``transformerFactory`` or ``transformer``. + Otherwise, specify either ``transformer_factory`` or ``transformer``. * all relevant features are to be normalised in the same way. - Otherwise, specify ``independentColumns=True``. + Otherwise, specify ``independent_columns=True``. :param skip: flag indicating whether no transformation shall be performed on all of the columns (because they are already normalised) @@ -523,7 +523,7 @@ def __init__(self, NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default + `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing instance's default factory will be used. 
See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a separate transformation is to be learned for each of them (rather than using the same transformation for all columns and @@ -554,11 +554,12 @@ def to_placeholder_rule(self): class Rule(ToStringMixin): def __init__(self, regex: Optional[str], - skip=False, unsupported=False, - transformer: SkLearnTransformerProtocol = None, - transformer_factory: Callable[[], SkLearnTransformerProtocol] = None, - array_valued=False, - fit=True, + skip: bool = False, + unsupported: bool = False, + transformer: Optional[SkLearnTransformerProtocol] = None, + transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None, + array_valued: bool = False, + fit: bool = True, independent_columns: Optional[bool] = None): """ :param regex: a regular expression defining the column(s) the rule applies to. @@ -574,16 +575,18 @@ def __init__(self, NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default + `transformer` is not given. If neither `transformer` nor `transformer_factory` are given, the containing instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. :param array_valued: whether the column values are not scalars but arrays (of arbitrary lengths). It is assumed that all entries in such arrays are to be normalised in the same way. 
If arrayValued is True, only a single matching column is supported, i.e. the regex must match at most one column. :param fit: whether the rule's transformer shall be fitted - :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a - separate transformation is to be learned for each of them (rather than using the same transformation for all columns and - learning the transformation from the data of all columns); must be specified for rules matching more than one column, None - is acceptable only for single-column rules + :param independent_columns: handles what should happen if the rule matches multiple columns. If that happens, + this param must be specified to True or False (with None an error will be raised during normalisation). + In that case, if True, a separate transformation will be learned for each of the columns. + If False, a single transformation will be learned from and applied to all matching columns. + For rules matching a single column, + the value of this parameter is irrelevant (and None is acceptable). """ if skip and (transformer is not None or transformer_factory is not None): raise ValueError("skip==True while transformer/transformerFactory is not None") @@ -624,18 +627,21 @@ def matches(self, column: str): def matching_columns(self, columns: Sequence[str]) -> List[str]: return [col for col in columns if self.matches(col)] - def __init__(self, rules: Sequence[Rule], default_transformer_factory=None, require_all_handled=True, inplace=False): + def __init__(self, rules: Sequence[Rule], default_transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None, + require_all_handled: bool = True, inplace: bool = False): """ - :param rules: the set of rules; rules are always fitted and applied in the given order. + :param rules: the set of rules; rules (i.e., their transformers) are always fitted and applied in the given order. 
A convenient way to obtain a set of rules in the :class:`sensai.vector_model.VectorModel` context is from a :class:`sensai.featuregen.FeatureCollector` or :class:`sensai.featuregen.MultiFeatureGenerator`. + Generally, it is often a good idea to associate rules (or a rule template) with a feature generator. + Then the rules can be obtained from it using `get_normalisation_rules`. :param default_transformer_factory: a factory for the creation of transformer instances (which implements the API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all rules that do not specify a particular transformer. The default transformer will only be applied to columns matched by such rules, unmatched columns will not be transformed. - Use SkLearnTransformerFactoryFactory to conveniently create a factory. - :param require_all_handled: whether to raise an exception if not all columns are matched by a rule + Use :class:`SkLearnTransformerFactoryFactory` to conveniently create a factory. + :param require_all_handled: whether to raise an exception if any column is not matched by a rule :param inplace: whether to apply data frame transformations in-place """ super().__init__() diff --git a/src/sensai/evaluation/evaluator.py b/src/sensai/evaluation/evaluator.py index 3b6f1a73..23f56928 100644 --- a/src/sensai/evaluation/evaluator.py +++ b/src/sensai/evaluation/evaluator.py @@ -266,7 +266,7 @@ def __init__(self, fractional split of the data :param fractional_split_shuffle: [if dataSplitter is None, test data must be obtained via split] whether to randomly (based on randomSeed) shuffle the dataset before splitting it - + :param metrics: regression metrics to apply. If None, default regression metrics are used. 
:param additional_metrics: additional regression metrics to apply :param output_data_frame_transformer: a data frame transformer to apply to all output data frames (both model outputs and ground truth), such that evaluation metrics are computed on the transformed data frame diff --git a/src/sensai/featuregen/feature_generator.py b/src/sensai/featuregen/feature_generator.py index efa28fb7..e24e6cf9 100644 --- a/src/sensai/featuregen/feature_generator.py +++ b/src/sensai/featuregen/feature_generator.py @@ -32,23 +32,25 @@ class FeatureGenerator(ToStringMixin, ABC): def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (), - normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None, - add_categorical_default_rules=True): + normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None, + add_categorical_default_rules: bool = True): """ :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names - (which must not only work for the feature generated by this feature generator, i.e. it should not match feature names generated + (which must not only work for the feature generated by this feature generator, i.e., it should not match feature names generated by other feature generators). It will be ensured that the respective columns in the generated data frames will have dtype 'category'. - Furthermore, presence of meta-information can later be leveraged for further transformations, e.g. one-hot encoding. - :param normalisation_rules: Rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model). + Furthermore, the presence of meta-information can later be leveraged for further transformations, e.g., one-hot encoding. 
+ :param normalisation_rules: Rules to be used by DFTNormalisation (e.g.,for constructing an input transformer for a model). These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used within a data processing pipeline. They do not affect feature generation. - :param normalisation_rule_template: This parameter can be supplied instead of normalisation_rules for the case where + :param normalisation_rule_template: This parameter can be supplied instead of `normalisation_rules` for the case where there shall be a single rule that applies to all columns generated by this feature generator that were not labeled as - categorical. + categorical. Like normalisation_rules, this is only relevant if a DFTNormalisation object consuming + normalisation rules is instantiated and used within a data processing pipeline. + It does not affect feature generation. :param add_categorical_default_rules: If True, normalisation rules for categorical features (which are unsupported by normalisation) and their corresponding one-hot - encoded features (with "_" appended) will be added. + encoded features (with "_" appended) will be added. It does not affect feature generation. 
""" # NOTE: While it would be more elegant to not have all of the above constructor arguments and instead provide # them later using "with*" methods, this would have the significant drawback that it would enable @@ -86,7 +88,7 @@ def __init__(self, self._categoricalFeatureRules.append(data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex + r"_\d+", skip=True)) # rule for one-hot transformation - self._name = None + self._name: Optional[str] = None self._isFitted = False # for backwards compatibility with persisted Featuregens based on code prior to commit 7088cbbe @@ -101,7 +103,7 @@ def _tostring_exclude_private(self) -> bool: def _tostring_additional_entries(self) -> Dict[str, Any]: return dict(name=self.get_name()) - def get_name(self): + def get_name(self) -> str: """ :return: the name of this feature generator, which may be a default name if the name has not been set. Note that feature generators created by a FeatureGeneratorFactory always get the name with which the generator factory was registered. 
@@ -110,10 +112,10 @@ def get_name(self): return f"{self.__class__.__name__}-{id(self)}" return self._name - def set_name(self, name): + def set_name(self, name: str) -> None: self._name = name - def get_names(self) -> list: + def get_names(self) -> list[str]: """ :return: the list of names of feature generators; will be a list with a single name for a regular feature generator """ @@ -869,7 +871,7 @@ def flattened_feature_generator(fgen: FeatureGenerator, columns_to_flatten: List :param keep_other_columns: whether any additional columns that are not to be flattened are to be retained by the returned feature generator :param normalisation_rules: additional normalisation rules for the flattened output columns - :param normalisation_rule_template: This parameter can be supplied instead of normalisationRules for the case where + :param normalisation_rule_template: This parameter can be supplied instead of normalisation_rules for the case where there shall be a single rule that applies to all flattened output columns :return: FeatureGenerator instance that will generate flattened versions of the specified columns and leave all other output columns as is. 
diff --git a/src/sensai/featuregen/feature_generator_registry.py b/src/sensai/featuregen/feature_generator_registry.py index bf274e8b..2dee89e7 100644 --- a/src/sensai/featuregen/feature_generator_registry.py +++ b/src/sensai/featuregen/feature_generator_registry.py @@ -17,7 +17,7 @@ class FeatureGeneratorRegistry: """ Represents a registry for (named) feature generator factories """ - def __init__(self, use_singletons=False): + def __init__(self, use_singletons: bool = False): """ :param use_singletons: if True, internally maintain feature generator singletons, such that there is at most one instance for each name/key @@ -50,7 +50,7 @@ def register_factory(self, name: Hashable, factory: Callable[[], FeatureGenerato raise ValueError(f"Generator for name '{name}' already registered") self._feature_generator_factories[name] = factory - def get_feature_generator(self, name) -> FeatureGenerator: + def get_feature_generator(self, name: str) -> FeatureGenerator: """ Creates a feature generator from a name, which must have been previously registered. The name of the returned feature generator (as returned by getName()) is set to name. 
diff --git a/src/sensai/vector_model.py b/src/sensai/vector_model.py index ba939384..7df58ed7 100644 --- a/src/sensai/vector_model.py +++ b/src/sensai/vector_model.py @@ -184,7 +184,7 @@ def with_feature_transformers(self: TVectorModel, *transformers: Union[DataFrame self._featureTransformerChain.append(t) return self - @deprecated("Use withFeatureTransformers instead; this method will be removed in a future sensAI release.") + @deprecated("Use with_feature_transformers instead; this method will be removed in a future sensAI release.") def with_input_transformers(self: TVectorModel, *input_transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel: """ From c439c34c9bf6d0f702cb834c904b6527ad297a93 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Wed, 21 Feb 2024 19:51:34 +0100 Subject: [PATCH 2/6] Let ManualScaler inherit from the corresponding protocol --- src/sensai/data_transformation/sklearn_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sensai/data_transformation/sklearn_transformer.py b/src/sensai/data_transformation/sklearn_transformer.py index 49f73e1a..ad06c1f0 100644 --- a/src/sensai/data_transformation/sklearn_transformer.py +++ b/src/sensai/data_transformation/sklearn_transformer.py @@ -30,7 +30,7 @@ def fit(self, arr: TransformableArray): pass -class ManualScaler: +class ManualScaler(SkLearnTransformerProtocol): """ A scaler whose parameters are not learnt from data but manually defined """ From f6b430b6055949eec4df57f080ba419cd3cc32de Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Mon, 26 Feb 2024 20:00:05 +0100 Subject: [PATCH 3/6] Typo --- src/sensai/featuregen/feature_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sensai/featuregen/feature_generator.py b/src/sensai/featuregen/feature_generator.py index e24e6cf9..0d462dfa 100644 --- a/src/sensai/featuregen/feature_generator.py +++ b/src/sensai/featuregen/feature_generator.py @@ -115,7 
+115,7 @@ def get_name(self) -> str: def set_name(self, name: str) -> None: self._name = name - def get_names(self) -> list[str]: + def get_names(self) -> List[str]: """ :return: the list of names of feature generators; will be a list with a single name for a regular feature generator """ From 46e708ce2a0fb442a5b2e8045b056f0ed38c5f7a Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Mon, 26 Feb 2024 20:03:12 +0100 Subject: [PATCH 4/6] Adjusted docstrings of DFTNorm rule and rule-template --- src/sensai/data_transformation/dft.py | 54 +++++++++++++++------------ 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index 4a744ba8..83d73b90 100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -491,6 +491,8 @@ class DFTNormalisation(DataFrameTransformer): DFTNormalisation ignores N/A values during fitting and application. """ + # TODO: better explanation of independentColumns mechanism + class RuleTemplate: def __init__(self, skip=False, @@ -510,28 +512,32 @@ def __init__(self, * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application is suitable for the relevant set of features. Otherwise, specify either ``transformer_factory`` or ``transformer``. - * all relevant features are to be normalised in the same way. - Otherwise, specify ``independent_columns=True``. + * the resulting rule will match only a single column. Otherwise, ``independent_columns`` + must be specified to True or False. :param skip: flag indicating whether no transformation shall be performed on all of the columns (because they are already normalised) :param unsupported: flag indicating whether normalisation of all columns is unsupported (shall trigger an exception if attempted) - :param transformer: a transformer instance (from sklearn.preprocessing, e.g. 
StandardScaler) to apply to the matching column(s) + :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either - transformerFactory or the containing instance's default factory will be used. - NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same - feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. + transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is + fitted. + NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the + same transformer instance to be used in multiple places - e.g., sharing it across several models that use the same + column with associated rule/rule template (disabling `fit` where appropriate). :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing instance's default + `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. 
- :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a - separate transformation is to be learned for each of them (rather than using the same transformation for all columns and - learning the transformation from the data of all columns); must be specified for rules matching more than one column, - None is acceptable only for a single column + :param independent_columns: only relevant if the resulting rule matches multiple columns, in which case it is required. + If True, the columns are treated independent and a separate transformation is to be learned for each of them. Note that + this doesn't mean each column will get a separate transformer instance! Rather, the transformer will be fitted + on the array resulting from selecting the matched columns. + If False, all matching columns are treated as a single feature for the purpose of normalisation. + Thus, all columns will be concatenated before fitting the transformer. """ if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: - raise ValueError("Passed transformer or transformerFactory while skip=True or unsupported=True") + raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") self.skip = skip self.unsupported = unsupported self.transformer = transformer @@ -569,24 +575,26 @@ def __init__(self, :param skip: flag indicating whether no transformation shall be performed on the matching column(s) :param unsupported: flag indicating whether normalisation of the matching column(s) is unsupported (shall trigger an exception if attempted) - :param transformer: a transformer instance (from sklearn.preprocessing, e.g. StandardScaler) to apply to the matching column(s) + :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. 
StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either - transformerFactory or the containing instance's default factory will be used. - NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same - feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. + transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is + fitted. + NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the + same transformer instance to be used in multiple places - e.g., sharing it across several models that use the same + column with associated rule/rule template (disabling `fit` where appropriate). :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given. If neither `transformer` nor `transformer_factory` are given, the containing instance's default + `transformer` is not given. If neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. :param array_valued: whether the column values are not scalars but arrays (of arbitrary lengths). It is assumed that all entries in such arrays are to be normalised in the same way. If arrayValued is True, only a single matching column is supported, i.e. the regex must match at most one column. :param fit: whether the rule's transformer shall be fitted - :param independent_columns: handles what should happen if the rule matches multiple columns. If that happens, - this param must be specified to True or False (with None an error will be raised during normalisation). 
- In that case, if True, a separate transformation will be learned for each of the columns. - If False, a single transformation will be learned from and applied to all matching columns. - For rules matching a single column, - the value of this parameter is irrelevant (and None is acceptable). + :param independent_columns: only relevant if the rule matches multiple columns, in which case it is required. + If True, the columns are treated independent and a separate transformation is to be learned for each of them. Note that + this doesn't mean each column will get a separate transformer instance! Rather, the transformer will be fitted + on the array resulting from selecting the matched columns. + If False, all matching columns are treated as a single feature for the purpose of normalisation. + Thus, all columns will be concatenated before fitting the transformer. """ if skip and (transformer is not None or transformer_factory is not None): raise ValueError("skip==True while transformer/transformerFactory is not None") From fc72880246cf545a0eec2bc2a8c46c1c8022608c Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Mon, 26 Feb 2024 20:09:02 +0100 Subject: [PATCH 5/6] Fix input validation in Rule --- src/sensai/data_transformation/dft.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index 83d73b90..1150fff3 100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -491,8 +491,6 @@ class DFTNormalisation(DataFrameTransformer): DFTNormalisation ignores N/A values during fitting and application. """ - # TODO: better explanation of independentColumns mechanism - class RuleTemplate: def __init__(self, skip=False, @@ -515,9 +513,9 @@ def __init__(self, * the resulting rule will match only a single column. Otherwise, ``independent_columns`` must be specified to True or False. 
- :param skip: flag indicating whether no transformation shall be performed on all of the columns (because they are already + :param skip: flag indicating whether no transformation shall be performed on matched columns (because they are already normalised) - :param unsupported: flag indicating whether normalisation of all columns is unsupported (shall trigger an exception if + :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if attempted) :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either @@ -596,8 +594,8 @@ def __init__(self, If False, all matching columns are treated as a single feature for the purpose of normalisation. Thus, all columns will be concatenated before fitting the transformer. """ - if skip and (transformer is not None or transformer_factory is not None): - raise ValueError("skip==True while transformer/transformerFactory is not None") + if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: + raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") self.regex = re.compile(regex) if regex is not None else None self.skip = skip self.unsupported = unsupported From fb0ee89f1246b51600bad1d5f2bec549f5829775 Mon Sep 17 00:00:00 2001 From: Michael Panchenko Date: Mon, 26 Feb 2024 21:16:50 +0100 Subject: [PATCH 6/6] DFTNormalisation: * Improve docstrings/comments * Improve type hints * Add parameters 'array_valued' and 'fit' to RuleTemplate --- src/sensai/data_transformation/dft.py | 131 +++++++++++++-------- src/sensai/featuregen/feature_generator.py | 2 +- 2 files changed, 85 insertions(+), 48 deletions(-) diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index 1150fff3..1bd5f0e2 
100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -479,7 +479,7 @@ def _apply(self, df: pd.DataFrame) -> pd.DataFrame: if self.keep is not None: df = df.loc[self.keep] if self.drop is not None: - df = df.drop(self.drop) + df = df.drop(self.drop) # type: ignore return df @@ -493,47 +493,59 @@ class DFTNormalisation(DataFrameTransformer): class RuleTemplate: def __init__(self, - skip=False, - unsupported=False, + skip: bool = False, + unsupported: bool = False, transformer: Optional[SkLearnTransformerProtocol] = None, transformer_factory: Callable[[], SkLearnTransformerProtocol] = None, - independent_columns: Optional[bool] = None): + independent_columns: Optional[bool] = None, + array_valued: bool = False, + fit: bool = True): """ - Creates a rule template which applies to one or more features/columns (depending on context). - Use parameters as follows: + A template from which a rule which matches multiple columns can be created. + This is useful for the generation of rules which shall apply to all the (numerical) columns generated + by a :class:`FeatureGenerator` without specifically naming them. - * If the relevant features are already normalised, pass ``skip=True`` - * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` - * If the relevant features shall be normalised, the other parameters apply. - No parameters, i.e. ``RuleTemplate()``, are an option if ... + Use the parameters as follows: - * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application - is suitable for the relevant set of features. - Otherwise, specify either ``transformer_factory`` or ``transformer``. - * the resulting rule will match only a single column. Otherwise, ``independent_columns`` - must be specified to True or False. 
+ * If the relevant features are already normalised, pass ``skip=True`` + * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` + * If the relevant features shall be normalised, the other parameters apply. + No parameters, i.e. ``RuleTemplate()``, are an option if ... + + * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application + is suitable for the relevant set of features. + Otherwise, specify either ``transformer_factory`` or ``transformer``. + * the resulting rule will match only a single column. Otherwise, ``independent_columns`` + must be specified to True or False. - :param skip: flag indicating whether no transformation shall be performed on matched columns (because they are already - normalised) + :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already + normalised). :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if - attempted) + attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making + their way into the final dataframe that will be normalised and used for training a model. :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is fitted. NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the - same transformer instance to be used in multiple places - e.g., sharing it across several models that use the same + same transformer instance to be used in multiple places - e.g. 
sharing it across several models that use the same column with associated rule/rule template (disabling `fit` where appropriate). :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. - :param independent_columns: only relevant if the resulting rule matches multiple columns, in which case it is required. - If True, the columns are treated independent and a separate transformation is to be learned for each of them. Note that - this doesn't mean each column will get a separate transformer instance! Rather, the transformer will be fitted - on the array resulting from selecting the matched columns. - If False, all matching columns are treated as a single feature for the purpose of normalisation. - Thus, all columns will be concatenated before fitting the transformer. + :param array_valued: only allowed if the rule matches a single column. If True, it expresses that + column values are not scalars but arrays (of arbitrary lengths). + It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same + transformation will be applied to each entry in the array. + :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is + if a transformer instance instead of a factory is given and the transformer is already fitted. + :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a + separate transformation is to be learned for each of them (rather than using the same transformation for all columns and + learning the transformation from the data of all columns). 
+ This parameter must be specified for rules matching more than one column, + None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect. """ + # NOTE: keep in sync with Rule! if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") self.skip = skip @@ -541,8 +553,10 @@ def __init__(self, self.transformer = transformer self.transformerFactory = transformer_factory self.independentColumns = independent_columns + self.arrayValued = array_valued + self.fit = fit - def to_rule(self, regex: Optional[str]): + def to_rule(self, regex: Optional[Union[str, re.Pattern]]): """ Convert the template to a rule for all columns matching the regex @@ -550,14 +564,14 @@ def to_rule(self, regex: Optional[str]): :return: the resulting Rule """ return DFTNormalisation.Rule(regex, skip=self.skip, unsupported=self.unsupported, transformer=self.transformer, - transformer_factory=self.transformerFactory, independent_columns=self.independentColumns) + transformer_factory=self.transformerFactory, independent_columns=self.independentColumns, array_valued=self.arrayValued, fit=self.fit) def to_placeholder_rule(self): return self.to_rule(None) class Rule(ToStringMixin): def __init__(self, - regex: Optional[str], + regex: Optional[Union[str, re.Pattern]], skip: bool = False, unsupported: bool = False, transformer: Optional[SkLearnTransformerProtocol] = None, @@ -566,37 +580,56 @@ def __init__(self, fit: bool = True, independent_columns: Optional[bool] = None): """ + Use the parameters as follows: + + * If the relevant features are already normalised, pass ``skip=True`` + * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` + * If the relevant features shall be normalised, the other parameters apply. + No parameters other than regex, i.e. 
``Rule(regex)``, are an option if ... + + * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application + is suitable for the relevant set of features. + Otherwise, specify either ``transformer_factory`` or ``transformer``. + * the resulting rule will match only a single column. Otherwise, ``independent_columns`` + must be explicitly set to True or False. + :param regex: a regular expression defining the column(s) the rule applies to. - If it applies to multiple columns, these columns will be normalised in the same way (using the same normalisation - process for each column) unless independentColumns=True. - If None, the rule is a placeholder rule and the regex must be set later via setRegex or the rule will not be applicable. - :param skip: flag indicating whether no transformation shall be performed on the matching column(s) - :param unsupported: flag indicating whether normalisation of the matching column(s) is unsupported (shall trigger an exception - if attempted) + If it matches multiple columns, these columns will be normalised in the same way (using the same normalisation + process for each column) unless independent_columns=True. + If None, the rule is a placeholder rule and the regex must be set later via set_regex or the rule will not be applicable. + :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already + normalised). + :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if + attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making + their way into the final dataframe that will be normalised and used for training a model. :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g.
StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is fitted. NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the - same transformer instance to be used in multiple places - e.g., sharing it across several models that use the same + same transformer instance to be used in multiple places - e.g. sharing it across several models that use the same column with associated rule/rule template (disabling `fit` where appropriate). :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given. If neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default + `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. - :param array_valued: whether the column values are not scalars but arrays (of arbitrary lengths). - It is assumed that all entries in such arrays are to be normalised in the same way. - If arrayValued is True, only a single matching column is supported, i.e. the regex must match at most one column. - :param fit: whether the rule's transformer shall be fitted - :param independent_columns: only relevant if the rule matches multiple columns, in which case it is required. - If True, the columns are treated independent and a separate transformation is to be learned for each of them. Note that - this doesn't mean each column will get a separate transformer instance! Rather, the transformer will be fitted - on the array resulting from selecting the matched columns. 
- If False, all matching columns are treated as a single feature for the purpose of normalisation. - Thus, all columns will be concatenated before fitting the transformer. + :param array_valued: only allowed if the rule matches a single column. If True, it expresses that + column values are not scalars but arrays (of arbitrary lengths). + It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same + transformation will be applied to each entry in the array. + :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is + if a transformer instance instead of a factory is given and the transformer is already fitted. + :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a + separate transformation is to be learned for each of them (rather than using the same transformation for all columns and + learning the transformation from the data of all columns). + This parameter must be specified for rules matching more than one column, + None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect. """ if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") - self.regex = re.compile(regex) if regex is not None else None + if isinstance(regex, str): + regex = re.compile(regex) + self.regex = regex + # NOTE: keep in sync with RuleTemplate! self.skip = skip self.unsupported = unsupported self.transformer = transformer @@ -668,6 +701,10 @@ def _tostring_additional_entries(self) -> Dict[str, Any]: def _fit(self, df: pd.DataFrame): matched_rules_by_column = {} self._rules = [] + # For rules matching multiple columns, if independent_columns is False, the columns + # will be concatenated and treated as a single column for fitting the transformer.
+ # Note that transformers follow sklearn interfaces, thus just passing an array + # to them will learn a per-column-transformation. This will be the case for independent_columns=True. for rule in self._userRules: matching_columns = rule.matching_columns(df.columns) for c in matching_columns: diff --git a/src/sensai/featuregen/feature_generator.py b/src/sensai/featuregen/feature_generator.py index 0d462dfa..bde0ff66 100644 --- a/src/sensai/featuregen/feature_generator.py +++ b/src/sensai/featuregen/feature_generator.py @@ -36,7 +36,7 @@ def __init__(self, add_categorical_default_rules: bool = True): """ :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names - (which must not only work for the feature generated by this feature generator, i.e., it should not match feature names generated + (which must not only work for the feature generated by this feature generator, i.e. it should not match feature names generated by other feature generators). It will be ensured that the respective columns in the generated data frames will have dtype 'category'. Furthermore, the presence of meta-information can later be leveraged for further transformations, e.g., one-hot encoding.