diff --git a/src/sensai/data_transformation/dft.py b/src/sensai/data_transformation/dft.py index cfa75f80..1bd5f0e2 100644 --- a/src/sensai/data_transformation/dft.py +++ b/src/sensai/data_transformation/dft.py @@ -479,7 +479,7 @@ def _apply(self, df: pd.DataFrame) -> pd.DataFrame: if self.keep is not None: df = df.loc[self.keep] if self.drop is not None: - df = df.drop(self.drop) + df = df.drop(self.drop) # type: ignore return df @@ -493,52 +493,70 @@ class DFTNormalisation(DataFrameTransformer): class RuleTemplate: def __init__(self, - skip=False, - unsupported=False, + skip: bool = False, + unsupported: bool = False, transformer: Optional[SkLearnTransformerProtocol] = None, transformer_factory: Callable[[], SkLearnTransformerProtocol] = None, - independent_columns: Optional[bool] = None): + independent_columns: Optional[bool] = None, + array_valued: bool = False, + fit: bool = True): """ - Creates a rule template which applies to one or more features/columns (depending on context). - Use parameters as follows: - - * If the relevant features are already normalised, pass ``skip=True`` - * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` - * If the relevant features shall be normalised, the other parameters apply. - No parameters, i.e. ``RuleTemplate()``, are an option if ... - - * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application - is suitable for the relevant set of features. - Otherwise, specify either ``transformerFactory`` or ``transformer``. - * all relevant features are to be normalised in the same way. - Otherwise, specify ``independentColumns=True``. 
- - :param skip: flag indicating whether no transformation shall be performed on all of the columns (because they are already - normalised) - :param unsupported: flag indicating whether normalisation of all columns is unsupported (shall trigger an exception if - attempted) - :param transformer: a transformer instance (from sklearn.preprocessing, e.g. StandardScaler) to apply to the matching column(s) + A template from which a rule which matches multiple columns can be created. + This is useful for the generation of rules which shall apply to all the (numerical) columns generated + by a :class:`FeatureGenerator` without specifically naming them. + + Use the parameters as follows: + + * If the relevant features are already normalised, pass ``skip=True`` + * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` + * If the relevant features shall be normalised, the other parameters apply. + No parameters, i.e. ``RuleTemplate()``, are an option if ... + + * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application + is suitable for the relevant set of features. + Otherwise, specify either ``transformer_factory`` or ``transformer``. + * the resulting rule will match only a single column. Otherwise, ``independent_columns`` + must be specified to True or False. + + :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already + normalised). + :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if + attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making + their way into the final dataframe that will be normalised and used for training a model. + :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. 
StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either - transformerFactory or the containing instance's default factory will be used. - NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same - feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. + transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is + fitted. + NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the + same transformer instance to be used in multiple places - e.g. sharing it across several models that use the same + column with associated rule/rule template (disabling `fit` where appropriate). :param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default + `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. + :param array_valued: only allowed if the rule matches a single column. If True, it expresses that + column values are not scalars but arrays (of arbitrary lengths). + It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same + transformation will be applied to each entry in the array. + :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is + if a transformer instance instead of a factory is given and the transformer is already fitted. 
:param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a separate transformation is to be learned for each of them (rather than using the same transformation for all columns and - learning the transformation from the data of all columns); must be specified for rules matching more than one column, - None is acceptable only for a single column + learning the transformation from the data of all columns). + This parameter must be specified for rules matching more than one column, + None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect. """ + # NOTE: keep in sync with Rule! if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: - raise ValueError("Passed transformer or transformerFactory while skip=True or unsupported=True") + raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") self.skip = skip self.unsupported = unsupported self.transformer = transformer self.transformerFactory = transformer_factory self.independentColumns = independent_columns + self.arrayValued = array_valued + self.fit = fit - def to_rule(self, regex: Optional[str]): + def to_rule(self, regex: Optional[Union[str, re.Pattern]]): """ Convert the template to a rule for all columns matching the regex @@ -546,48 +564,72 @@ def to_rule(self, regex: Optional[str]): :return: the resulting Rule """ return DFTNormalisation.Rule(regex, skip=self.skip, unsupported=self.unsupported, transformer=self.transformer, - transformer_factory=self.transformerFactory, independent_columns=self.independentColumns) + transformer_factory=self.transformerFactory, independent_columns=self.independentColumns, array_valued=self.arrayValued, fit=self.fit) def to_placeholder_rule(self): return self.to_rule(None) class Rule(ToStringMixin): def __init__(self, - regex: Optional[str], - skip=False, unsupported=False, - transformer: 
SkLearnTransformerProtocol = None, - transformer_factory: Callable[[], SkLearnTransformerProtocol] = None, - array_valued=False, - fit=True, + regex: Optional[Union[str, re.Pattern]], + skip: bool = False, + unsupported: bool = False, + transformer: Optional[SkLearnTransformerProtocol] = None, + transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None, + array_valued: bool = False, + fit: bool = True, independent_columns: Optional[bool] = None): """ + Use the parameters as follows: + + * If the relevant features are already normalised, pass ``skip=True`` + * If the relevant features cannot be normalised (e.g. because they are categorical), pass ``unsupported=True`` + * If the relevant features shall be normalised, the other parameters apply. + No parameters other than regex, i.e. ``Rule(regex)``, are an option if ... + + * a default transformer factory is specified in the :class:`DFTNormalisation` instance and its application + is suitable for the relevant set of features. + Otherwise, specify either ``transformer_factory`` or ``transformer``. + * the resulting rule will match only a single column. Otherwise, ``independent_columns`` + must be specified to True or False. + :param regex: a regular expression defining the column(s) the rule applies to. - If it applies to multiple columns, these columns will be normalised in the same way (using the same normalisation - process for each column) unless independentColumns=True. - If None, the rule is a placeholder rule and the regex must be set later via setRegex or the rule will not be applicable. - :param skip: flag indicating whether no transformation shall be performed on the matching column(s) - :param unsupported: flag indicating whether normalisation of the matching column(s) is unsupported (shall trigger an exception - if attempted) - :param transformer: a transformer instance (from sklearn.preprocessing, e.g. 
StandardScaler) to apply to the matching column(s) + If it matches multiple columns, these columns will be normalised in the same way (using the same normalisation + process for each column) unless independent_columns=True. + If None, the rule is a placeholder rule and the regex must be set later via set_regex or the rule will not be applicable. + :param skip: flag indicating whether no transformation shall be performed on the matched columns (e.g. because they are already + normalised). + :param unsupported: flag indicating whether normalisation of matched columns is unsupported (shall trigger an exception if + attempted). Useful e.g. for preventing intermediate features that need further processing (like columns containing strings) from making + their way into the final dataframe that will be normalised and used for training a model. + :param transformer: a transformer instance (following the sklearn.preprocessing interface, e.g. StandardScaler) to apply to the matching column(s) for the case where a transformation is necessary (skip=False, unsupported=False). If None is given, either - transformerFactory or the containing instance's default factory will be used. - NOTE: Use an instance only if you want, in particular, the instance to be shared across several models that use the same - feature with associated rule/rule template (disabling `fit` where appropriate). Otherwise, use a factory. + transformer_factory or the containing ``DFTNormalisation`` instance's default factory will be used when the normaliser is + fitted. + NOTE: Using a transformer_factory is usually preferred. Use an instance only if you want the + same transformer instance to be used in multiple places - e.g. sharing it across several models that use the same + column with associated rule/rule template (disabling `fit` where appropriate). 
:param transformer_factory: a factory for the generation of the transformer instance, which will only be applied if - `transformer` is not given; if neither `transformer` nor `transformerInstance` are given, the containing instance's default + `transformer` is not given; if neither `transformer` nor `transformer_factory` are given, the containing ``DFTNormalisation`` instance's default factory will be used. See :class:`SkLearnTransformerFactoryFactory` for convenient construction options. - :param array_valued: whether the column values are not scalars but arrays (of arbitrary lengths). - It is assumed that all entries in such arrays are to be normalised in the same way. - If arrayValued is True, only a single matching column is supported, i.e. the regex must match at most one column. - :param fit: whether the rule's transformer shall be fitted + :param array_valued: only allowed if the rule matches a single column. If True, it expresses that + column values are not scalars but arrays (of arbitrary lengths). + It is then assumed that all entries in such arrays are to be normalised in the same way, i.e., the same + transformation will be applied to each entry in the array. + :param fit: whether the rule's transformer shall be fitted. One use case for setting this to False is + if a transformer instance instead of a factory is given and the transformer is already fitted. :param independent_columns: whether, for the case where the rule matches multiple columns, the columns are independent and a separate transformation is to be learned for each of them (rather than using the same transformation for all columns and - learning the transformation from the data of all columns); must be specified for rules matching more than one column, None - is acceptable only for single-column rules + learning the transformation from the data of all columns). 
+ This parameter must be specified for rules matching more than one column, + None is acceptable for rules matching a single column, in which case None, True, and False all have the same effect. """ - if skip and (transformer is not None or transformer_factory is not None): - raise ValueError("skip==True while transformer/transformerFactory is not None") - self.regex = re.compile(regex) if regex is not None else None + if (skip or unsupported) and count_not_none(transformer, transformer_factory) > 0: + raise ValueError("Passed transformer or transformer_factory while skip=True or unsupported=True") + if isinstance(regex, str): + regex = re.compile(regex) + self.regex = regex + # NOTE: keep in sync with RuleTemplate! self.skip = skip self.unsupported = unsupported self.transformer = transformer @@ -624,18 +666,21 @@ def matches(self, column: str): def matching_columns(self, columns: Sequence[str]) -> List[str]: return [col for col in columns if self.matches(col)] - def __init__(self, rules: Sequence[Rule], default_transformer_factory=None, require_all_handled=True, inplace=False): + def __init__(self, rules: Sequence[Rule], default_transformer_factory: Optional[Callable[[], SkLearnTransformerProtocol]] = None, + require_all_handled: bool = True, inplace: bool = False): """ - :param rules: the set of rules; rules are always fitted and applied in the given order. + :param rules: the set of rules; rules (i.e., their transformers) are always fitted and applied in the given order. A convenient way to obtain a set of rules in the :class:`sensai.vector_model.VectorModel` context is from a :class:`sensai.featuregen.FeatureCollector` or :class:`sensai.featuregen.MultiFeatureGenerator`. + Generally, it is often a good idea to associate rules (or a rule template) with a feature generator. + Then the rules can be obtained from it using `get_normalisation_rules`. 
:param default_transformer_factory: a factory for the creation of transformer instances (which implements the API used by sklearn.preprocessing, e.g. StandardScaler) that shall be used to create a transformer for all rules that do not specify a particular transformer. The default transformer will only be applied to columns matched by such rules, unmatched columns will not be transformed. - Use SkLearnTransformerFactoryFactory to conveniently create a factory. - :param require_all_handled: whether to raise an exception if not all columns are matched by a rule + Use :class:`SkLearnTransformerFactoryFactory` to conveniently create a factory. + :param require_all_handled: whether to raise an exception if any column is not matched by a rule :param inplace: whether to apply data frame transformations in-place """ super().__init__() @@ -656,6 +701,10 @@ def _tostring_additional_entries(self) -> Dict[str, Any]: def _fit(self, df: pd.DataFrame): matched_rules_by_column = {} self._rules = [] + # For rules matching multiple columns, if independent_columns is False, the columns + # will be concatenated and treated as a single column for fitting the transformer. + # Note that transformers follow sklearn interfaces, thus just passing an array + # to them will learn a per-column-transformation. This will be the case for independent_columns=True. 
for rule in self._userRules: matching_columns = rule.matching_columns(df.columns) for c in matching_columns: diff --git a/src/sensai/data_transformation/sklearn_transformer.py b/src/sensai/data_transformation/sklearn_transformer.py index 49f73e1a..ad06c1f0 100644 --- a/src/sensai/data_transformation/sklearn_transformer.py +++ b/src/sensai/data_transformation/sklearn_transformer.py @@ -30,7 +30,7 @@ def fit(self, arr: TransformableArray): pass -class ManualScaler: +class ManualScaler(SkLearnTransformerProtocol): """ A scaler whose parameters are not learnt from data but manually defined """ diff --git a/src/sensai/evaluation/evaluator.py b/src/sensai/evaluation/evaluator.py index 3b6f1a73..23f56928 100644 --- a/src/sensai/evaluation/evaluator.py +++ b/src/sensai/evaluation/evaluator.py @@ -266,7 +266,7 @@ def __init__(self, fractional split of the data :param fractional_split_shuffle: [if dataSplitter is None, test data must be obtained via split] whether to randomly (based on randomSeed) shuffle the dataset before splitting it - + :param metrics: regression metrics to apply. If None, default regression metrics are used. 
:param additional_metrics: additional regression metrics to apply :param output_data_frame_transformer: a data frame transformer to apply to all output data frames (both model outputs and ground truth), such that evaluation metrics are computed on the transformed data frame diff --git a/src/sensai/featuregen/feature_generator.py b/src/sensai/featuregen/feature_generator.py index efa28fb7..bde0ff66 100644 --- a/src/sensai/featuregen/feature_generator.py +++ b/src/sensai/featuregen/feature_generator.py @@ -32,23 +32,25 @@ class FeatureGenerator(ToStringMixin, ABC): def __init__(self, categorical_feature_names: Optional[Union[Sequence[str], str]] = None, normalisation_rules: Sequence[data_transformation.DFTNormalisation.Rule] = (), - normalisation_rule_template: data_transformation.DFTNormalisation.RuleTemplate = None, - add_categorical_default_rules=True): + normalisation_rule_template: Optional[data_transformation.DFTNormalisation.RuleTemplate] = None, + add_categorical_default_rules: bool = True): """ :param categorical_feature_names: either a sequence of column names or a regex that is to match all categorical feature names (which must not only work for the feature generated by this feature generator, i.e. it should not match feature names generated by other feature generators). It will be ensured that the respective columns in the generated data frames will have dtype 'category'. - Furthermore, presence of meta-information can later be leveraged for further transformations, e.g. one-hot encoding. - :param normalisation_rules: Rules to be used by DFTNormalisation (e.g. for constructing an input transformer for a model). + Furthermore, the presence of meta-information can later be leveraged for further transformations, e.g., one-hot encoding. + :param normalisation_rules: Rules to be used by DFTNormalisation (e.g., for constructing an input transformer for a model). 
These rules are only relevant if a DFTNormalisation object consuming them is instantiated and used within a data processing pipeline. They do not affect feature generation. - :param normalisation_rule_template: This parameter can be supplied instead of normalisation_rules for the case where + :param normalisation_rule_template: This parameter can be supplied instead of `normalisation_rules` for the case where there shall be a single rule that applies to all columns generated by this feature generator that were not labeled as - categorical. + categorical. Like normalisation_rules, this is only relevant if a DFTNormalisation object consuming + normalisation rules is instantiated and used within a data processing pipeline. + It does not affect feature generation. :param add_categorical_default_rules: If True, normalisation rules for categorical features (which are unsupported by normalisation) and their corresponding one-hot - encoded features (with "_" appended) will be added. + encoded features (with "_" appended) will be added. It does not affect feature generation. """ # NOTE: While it would be more elegant to not have all of the above constructor arguments and instead provide # them later using "with*" methods, this would have the significant drawback that it would enable @@ -86,7 +88,7 @@ def __init__(self, self._categoricalFeatureRules.append(data_transformation.DFTNormalisation.Rule(categorical_feature_name_regex + r"_\d+", skip=True)) # rule for one-hot transformation - self._name = None + self._name: Optional[str] = None self._isFitted = False # for backwards compatibility with persisted Featuregens based on code prior to commit 7088cbbe @@ -101,7 +103,7 @@ def _tostring_exclude_private(self) -> bool: def _tostring_additional_entries(self) -> Dict[str, Any]: return dict(name=self.get_name()) - def get_name(self): + def get_name(self) -> str: """ :return: the name of this feature generator, which may be a default name if the name has not been set. 
Note that feature generators created by a FeatureGeneratorFactory always get the name with which the generator factory was registered. @@ -110,10 +112,10 @@ def get_name(self): return f"{self.__class__.__name__}-{id(self)}" return self._name - def set_name(self, name): + def set_name(self, name: str) -> None: self._name = name - def get_names(self) -> list: + def get_names(self) -> List[str]: """ :return: the list of names of feature generators; will be a list with a single name for a regular feature generator """ @@ -869,7 +871,7 @@ def flattened_feature_generator(fgen: FeatureGenerator, columns_to_flatten: List :param keep_other_columns: whether any additional columns that are not to be flattened are to be retained by the returned feature generator :param normalisation_rules: additional normalisation rules for the flattened output columns - :param normalisation_rule_template: This parameter can be supplied instead of normalisationRules for the case where + :param normalisation_rule_template: This parameter can be supplied instead of normalisation_rules for the case where there shall be a single rule that applies to all flattened output columns :return: FeatureGenerator instance that will generate flattened versions of the specified columns and leave all other output columns as is. 
diff --git a/src/sensai/featuregen/feature_generator_registry.py b/src/sensai/featuregen/feature_generator_registry.py index bf274e8b..2dee89e7 100644 --- a/src/sensai/featuregen/feature_generator_registry.py +++ b/src/sensai/featuregen/feature_generator_registry.py @@ -17,7 +17,7 @@ class FeatureGeneratorRegistry: """ Represents a registry for (named) feature generator factories """ - def __init__(self, use_singletons=False): + def __init__(self, use_singletons: bool = False): """ :param use_singletons: if True, internally maintain feature generator singletons, such that there is at most one instance for each name/key @@ -50,7 +50,7 @@ def register_factory(self, name: Hashable, factory: Callable[[], FeatureGenerato raise ValueError(f"Generator for name '{name}' already registered") self._feature_generator_factories[name] = factory - def get_feature_generator(self, name) -> FeatureGenerator: + def get_feature_generator(self, name: str) -> FeatureGenerator: """ Creates a feature generator from a name, which must have been previously registered. The name of the returned feature generator (as returned by getName()) is set to name. diff --git a/src/sensai/vector_model.py b/src/sensai/vector_model.py index ba939384..7df58ed7 100644 --- a/src/sensai/vector_model.py +++ b/src/sensai/vector_model.py @@ -184,7 +184,7 @@ def with_feature_transformers(self: TVectorModel, *transformers: Union[DataFrame self._featureTransformerChain.append(t) return self - @deprecated("Use withFeatureTransformers instead; this method will be removed in a future sensAI release.") + @deprecated("Use with_feature_transformers instead; this method will be removed in a future sensAI release.") def with_input_transformers(self: TVectorModel, *input_transformers: Union[DataFrameTransformer, List[DataFrameTransformer]]) -> TVectorModel: """