From 95111ffa994112b3e94248f82fed99b39f8f2793 Mon Sep 17 00:00:00 2001
From: boccaff
Date: Sat, 2 Nov 2024 17:08:40 +0000
Subject: [PATCH] feat: add support to sklearn TargetEncoder

Signed-off-by: boccaff
---
 skl2onnx/_supported_operators.py              |   2 +
 skl2onnx/operator_converters/__init__.py      |   2 +
 .../operator_converters/target_encoder.py     | 100 +++++++
 skl2onnx/shape_calculators/__init__.py        |   2 +
 skl2onnx/shape_calculators/target_encoder.py  |  30 ++
 .../test_sklearn_target_encoder_converter.py  | 265 ++++++++++++++++++
 .../test_utils/reference_implementation_ml.py |  46 +++
 tests/test_utils/utils_backend_onnx.py        |   1 +
 8 files changed, 448 insertions(+)
 create mode 100644 skl2onnx/operator_converters/target_encoder.py
 create mode 100644 skl2onnx/shape_calculators/target_encoder.py
 create mode 100644 tests/test_sklearn_target_encoder_converter.py

diff --git a/skl2onnx/_supported_operators.py b/skl2onnx/_supported_operators.py
index 7343bb1e8..2d3650cf7 100644
--- a/skl2onnx/_supported_operators.py
+++ b/skl2onnx/_supported_operators.py
@@ -244,6 +244,7 @@
     LabelEncoder,
     Normalizer,
     OneHotEncoder,
+    TargetEncoder,
 )

 try:
@@ -511,6 +512,7 @@ def build_sklearn_operator_name_map():
             RidgeClassifierCV: "SklearnLinearClassifier",
             SGDRegressor: "SklearnLinearRegressor",
             StandardScaler: "SklearnScaler",
+            TargetEncoder: "SklearnTargetEncoder",
             TheilSenRegressor: "SklearnLinearRegressor",
         }
     )
diff --git a/skl2onnx/operator_converters/__init__.py b/skl2onnx/operator_converters/__init__.py
index 0a9dadfbd..f5a78851e 100644
--- a/skl2onnx/operator_converters/__init__.py
+++ b/skl2onnx/operator_converters/__init__.py
@@ -61,6 +61,7 @@
 from . import sgd_oneclass_svm
 from . import stacking
 from . import support_vector_machines
+from . import target_encoder
 from . import text_vectoriser
 from . import tfidf_transformer
 from . import tfidf_vectoriser
@@ -128,6 +129,7 @@
     sgd_oneclass_svm,
     stacking,
     support_vector_machines,
+    target_encoder,
     text_vectoriser,
     tfidf_transformer,
     tfidf_vectoriser,
diff --git a/skl2onnx/operator_converters/target_encoder.py b/skl2onnx/operator_converters/target_encoder.py
new file mode 100644
index 000000000..c64fac6f8
--- /dev/null
+++ b/skl2onnx/operator_converters/target_encoder.py
@@ -0,0 +1,100 @@
+# SPDX-License-Identifier: Apache-2.0
+import numpy as np
+
+from ..common._apply_operation import apply_cast, apply_concat, apply_reshape
+from ..common._container import ModelComponentContainer
+from ..common.data_types import (
+    FloatTensorType,
+    Int64TensorType,
+)
+from ..common._registration import register_converter
+from ..common._topology import Scope, Operator
+from ..proto import onnx_proto
+
+
+def convert_sklearn_target_encoder(
+    scope: Scope, operator: Operator, container: ModelComponentContainer
+):
+    op = operator.raw_operator
+    result = []
+    input_idx = 0
+    dimension_idx = 0
+
+    # Only binary-classification and continuous targets are supported:
+    # a multiclass target produces one encoding column per class.
+    if op.target_type_ == "multiclass" or (
+        op.classes_ is not None and len(op.classes_) > 2
+    ):
+        raise NotImplementedError("multiclass TargetEncoder is not supported")
+
+    # One ai.onnx.ml LabelEncoder per encoded column maps each category to its
+    # learned encoding; unseen categories fall back to the global target mean.
+    for categories, encodings in zip(op.categories_, op.encodings_):
+        if len(categories) == 0:
+            continue
+
+        current_input = operator.inputs[input_idx]
+        if current_input.get_second_dimension() == 1:
+            feature_column = current_input
+            input_idx += 1
+        else:
+            # Multi-column input: slice out column `dimension_idx` first.
+            index_name = scope.get_unique_variable_name("index")
+            container.add_initializer(
+                index_name, onnx_proto.TensorProto.INT64, [], [dimension_idx]
+            )
+
+            feature_column = scope.declare_local_variable(
+                "feature_column",
+                current_input.type.__class__([current_input.get_first_dimension(), 1]),
+            )
+
+            container.add_node(
+                "ArrayFeatureExtractor",
+                [current_input.onnx_name, index_name],
+                feature_column.onnx_name,
+                op_domain="ai.onnx.ml",
+                name=scope.get_unique_operator_name("ArrayFeatureExtractor"),
+            )
+
+            dimension_idx += 1
+            if dimension_idx == current_input.get_second_dimension():
+                dimension_idx = 0
+                input_idx += 1
+
+        attrs = {"name": scope.get_unique_operator_name("LabelEncoder")}
+        if isinstance(feature_column.type, FloatTensorType):
+            attrs["keys_floats"] = np.array(
+                [float(s) for s in categories], dtype=np.float32
+            )
+        elif isinstance(feature_column.type, Int64TensorType):
+            attrs["keys_int64s"] = np.array(
+                [int(s) for s in categories], dtype=np.int64
+            )
+        else:
+            attrs["keys_strings"] = np.array(
+                [str(s).encode("utf-8") for s in categories]
+            )
+        attrs["values_floats"] = encodings
+        attrs["default_float"] = op.target_mean_
+
+        result.append(scope.get_unique_variable_name("ordinal_output"))
+        label_encoder_output = scope.get_unique_variable_name("label_encoder")
+
+        container.add_node(
+            "LabelEncoder",
+            feature_column.onnx_name,
+            label_encoder_output,
+            op_domain="ai.onnx.ml",
+            op_version=2,
+            **attrs,
+        )
+        apply_reshape(
+            scope,
+            label_encoder_output,
+            result[-1],
+            container,
+            desired_shape=(-1, 1),
+        )
+
+    concat_result_name = scope.get_unique_variable_name("concat_result")
+    apply_concat(scope, result, concat_result_name, container, axis=1)
+    apply_cast(
+        scope,
+        concat_result_name,
+        operator.output_full_names,
+        container,
+        to=onnx_proto.TensorProto.FLOAT,
+    )
+
+
+register_converter("SklearnTargetEncoder", convert_sklearn_target_encoder)
diff --git a/skl2onnx/shape_calculators/__init__.py b/skl2onnx/shape_calculators/__init__.py
index ab5556b1e..cc85afdfd 100644
--- a/skl2onnx/shape_calculators/__init__.py
+++ b/skl2onnx/shape_calculators/__init__.py
@@ -47,6 +47,7 @@
 from . import sgd_oneclass_svm
 from . import svd
 from . import support_vector_machines
+from . import target_encoder
 from . import text_vectorizer
 from . import tuned_threshold_classifier
 from . import tfidf_transformer
@@ -99,6 +100,7 @@
     sgd_oneclass_svm,
     svd,
     support_vector_machines,
+    target_encoder,
     text_vectorizer,
     tfidf_transformer,
     tuned_threshold_classifier,
diff --git a/skl2onnx/shape_calculators/target_encoder.py b/skl2onnx/shape_calculators/target_encoder.py
new file mode 100644
index 000000000..b559a240c
--- /dev/null
+++ b/skl2onnx/shape_calculators/target_encoder.py
@@ -0,0 +1,30 @@
+# SPDX-License-Identifier: Apache-2.0
+
+
+from ..common._registration import register_shape_calculator
+from ..common.data_types import FloatTensorType
+from ..common.data_types import Int64TensorType, StringTensorType
+from ..common.utils import check_input_and_output_numbers
+from ..common.utils import check_input_and_output_types
+
+
+def calculate_sklearn_target_encoder_output_shapes(operator):
+    """
+    TargetEncoder maps every input column to one float column, so the output
+    keeps the input's first dimension N and has one column per encoded
+    feature, i.e. len(categories_) columns.
+    """
+    check_input_and_output_numbers(operator, output_count_range=1)
+    check_input_and_output_types(
+        operator, good_input_types=[FloatTensorType, Int64TensorType, StringTensorType]
+    )
+
+    N = operator.inputs[0].get_first_dimension()
+    shape = [N, len(operator.raw_operator.categories_)]
+
+    operator.outputs[0].type = FloatTensorType(shape=shape)
+
+
+register_shape_calculator(
+    "SklearnTargetEncoder", calculate_sklearn_target_encoder_output_shapes
+)
diff --git a/tests/test_sklearn_target_encoder_converter.py b/tests/test_sklearn_target_encoder_converter.py
new file mode 100644
index 000000000..b658736ec
--- /dev/null
+++ b/tests/test_sklearn_target_encoder_converter.py
@@ -0,0 +1,265 @@
+# SPDX-License-Identifier: Apache-2.0
+
+"""Tests scikit-learn's TargetEncoder converter."""
+
+from skl2onnx import convert_sklearn, to_onnx
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.pipeline import make_pipeline
+from sklearn.compose import ColumnTransformer
+from sklearn import __version__ as sklearn_version
+import onnxruntime
+import pandas as pd
+from numpy.testing import assert_almost_equal
+import unittest
+import packaging.version as pv
+import numpy as np
+from onnxruntime import __version__ as ort_version
+from skl2onnx.common.data_types import (
+    Int64TensorType,
+    StringTensorType,
+)
+from test_utils import dump_data_and_model, TARGET_OPSET
+
+try:
+    from sklearn.preprocessing import TargetEncoder
+except ImportError:
+    pass
+
+
+ort_version = ".".join(ort_version.split(".")[:2])
+
+
+def target_encoder_support():
+    # pv.Version does not work with development versions
+    vers = ".".join(sklearn_version.split(".")[:2])
+    if pv.Version(vers) < pv.Version("1.5.0"):
+        return False
+    if pv.Version(onnxruntime.__version__) < pv.Version("0.3.0"):
+        return False
+    return True
+
+
+def set_output_support():
+    vers = ".".join(sklearn_version.split(".")[:2])
+    return pv.Version(vers) >= pv.Version("1.2")
+
+
+class TestSklearnTargetEncoderConverter(unittest.TestCase):
+    @unittest.skipIf(
+        not target_encoder_support(),
+        reason="TargetEncoder was not available before 1.5",
+    )
+    def test_model_target_encoder(self):
+        model = TargetEncoder()
+        X = np.array(["str3", "str2", "str0", "str1", "str3"]).reshape(-1, 1)
"str3"]).reshape(-1, 1) + y = np.array([0.0, 1.0, 1.0, 0.0, 1.0]) + model.fit(X, y) + model_onnx = convert_sklearn( + model, + "scikit-learn target encoder", + [("input", StringTensorType([None, X.shape[1]]))], + target_opset=TARGET_OPSET, + ) + self.assertTrue(model_onnx is not None) + self.assertTrue(model_onnx.graph.node is not None) + if model_onnx.ir_version >= 7 and TARGET_OPSET < 12: + raise AssertionError("Incompatbilities") + dump_data_and_model(X, model, model_onnx, + basename="SklearnTargetEncoder") + + @unittest.skipIf( + not target_encoder_support(), + reason="TargetEncoder was not available before 1.5", + ) + def test_model_target_encoder_int(self): + model = TargetEncoder() + X = np.array([0, 0, 1, 0, 0, 1, 1], dtype=np.int64).reshape(-1, 1) + y = np.array([1, 1, 1, 0, 0, 0, 0], dtype=np.int64) + X_test = np.array([0, 1, 2, 1, 0], dtype=np.int64).reshape(-1, 1) + + model.fit(X, y) + model_onnx = convert_sklearn( + model, + "scikit-learn label encoder", + [("input", Int64TensorType([None, X.shape[1]]))], + target_opset=TARGET_OPSET, + ) + self.assertTrue(model_onnx is not None) + self.assertTrue(model_onnx.graph.node is not None) + if model_onnx.ir_version >= 7 and TARGET_OPSET < 12: + raise AssertionError("Incompatbilities") + dump_data_and_model(X_test, model, model_onnx, + basename="SklearnTargetEncoderInt") + + @unittest.skipIf( + not target_encoder_support(), + reason="TargetEncoder was not available before 1.5", + ) + def test_target_encoder_twocats(self): + data = [["cat2"], ["cat1"]] + label = [0, 1] + model = TargetEncoder(categories="auto") + model.fit(data, label) + inputs = [("input1", StringTensorType([None, 1]))] + model_onnx = convert_sklearn( + model, "ordinal encoder two string cats", inputs, target_opset=TARGET_OPSET + ) + self.assertTrue(model_onnx is not None) + dump_data_and_model( + data, model, model_onnx, basename="SklearnTargetEncoderTwoStringCat" + ) + + @unittest.skipIf( + not set_output_support(), + reason="'ColumnTransformer' object has no attribute 'set_output'", + ) + @unittest.skipIf( + not target_encoder_support(), + reason="TargetEncoder was not available before 1.5", + ) + def test_target_encoder_pipeline_int64(self): + from onnxruntime import InferenceSession + + data = pd.DataFrame( + {"cat": ["cat2", "cat1"] * 10, "num": [0, 1, 1, 0] * 5}) + data["num"] = data["num"].astype(np.float32) + y = np.array([0, 1, 0, 1] * 5, dtype=np.float32) + # target encoder uses cross-fitting and have cv=5 as default, which + # caused some folds to have constant y. 
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ("cat", TargetEncoder(cv=2), ["cat"]),
+                ("num", "passthrough", ["num"]),
+            ],
+            sparse_threshold=1,
+            verbose_feature_names_out=False,
+        ).set_output(transform="pandas")
+        model = make_pipeline(
+            preprocessor, RandomForestRegressor(n_estimators=3, max_depth=2)
+        )
+        model.fit(data, y)
+        expected = model.predict(data)
+        model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET)
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "cat": data["cat"].values.reshape((-1, 1)),
+                "num": data["num"].values.reshape((-1, 1)),
+            },
+        )
+        assert_almost_equal(expected, got[0].ravel())
+
+    @unittest.skipIf(
+        not set_output_support(),
+        reason="'ColumnTransformer' object has no attribute 'set_output'",
+    )
+    @unittest.skipIf(
+        not target_encoder_support(),
+        reason="TargetEncoder was not available before 1.5",
+    )
+    def test_target_encoder_pipeline_string_int64(self):
+        from onnxruntime import InferenceSession
+
+        data = pd.DataFrame(
+            {
+                "C1": ["cat2", "cat1", "cat3"] * 10,
+                "C2": [1, 0, 1] * 10,
+                "num": [0, 1, 1] * 10,
+            }
+        )
+        data["num"] = data["num"].astype(np.float32)
+        data["C2"] = data["C2"].astype(np.int64)
+        y = np.array([0, 1, 0, 1, 0, 1] * 5, dtype=np.float32)
+        preprocessor = ColumnTransformer(
+            transformers=[
+                ("cat", TargetEncoder(cv=2), ["C1", "C2"]),
+                ("num", "passthrough", ["num"]),
+            ],
+            sparse_threshold=1,
+            verbose_feature_names_out=False,
+        ).set_output(transform="pandas")
+        model = make_pipeline(
+            preprocessor, RandomForestRegressor(n_estimators=3, max_depth=2)
+        )
+        model.fit(data, y)
+        expected = model.predict(data)
+        model_onnx = to_onnx(model, data, target_opset=TARGET_OPSET)
+        sess = InferenceSession(
+            model_onnx.SerializeToString(), providers=["CPUExecutionProvider"]
+        )
+        got = sess.run(
+            None,
+            {
+                "C1": data["C1"].values.reshape((-1, 1)),
+                "C2": data["C2"].values.reshape((-1, 1)),
+                "num": data["num"].values.reshape((-1, 1)),
+            },
+        )
+        assert_almost_equal(expected, got[0].ravel())
+
+    @unittest.skipIf(
+        not target_encoder_support(),
+        reason="TargetEncoder was not available before 1.5",
+    )
+    @unittest.skipIf(TARGET_OPSET < 9, reason="not available")
+    def test_target_encoder_mixed_string_int_pandas(self):
+        col1 = "col1"
+        col2 = "col2"
+        col3 = "col3"
+        data_pd = pd.DataFrame(
+            {
+                col1: np.array(["c0.4", "c1.4", "c0.2", "c0.2", "c0.2", "c0.2"]),
+                col2: np.array(["c0.2", "c1.2", "c2.2", "c2.2", "c2.2", "c2.2"]),
+                col3: np.array([3, 0, 1, 1, 1, 1]),
+            }
+        )
+        data_label = np.array([0, 0, 1, 1, 1, 0])
+        test_pd = pd.DataFrame(
+            {
+                col1: np.array(["c0.2"]),
+                col2: np.array(["c2.2"]),
+                col3: np.array([1]),
+            }
+        )
+        model = TargetEncoder()
+        model.fit(data_pd, data_label)
+        inputs = [
+            ("input1", StringTensorType([None, 2])),
+            ("input2", Int64TensorType([None, 1])),
+        ]
+        model_onnx = convert_sklearn(
+            model, "target encoder", inputs, target_opset=TARGET_OPSET
+        )
+        self.assertIsNotNone(model_onnx)
+        dump_data_and_model(
+            test_pd,
+            model,
+            model_onnx,
+            basename="SklearnTargetEncoderMixedStringIntPandas",
+        )
+
+    @unittest.skipIf(
+        not target_encoder_support(),
+        reason="TargetEncoder was not available before 1.5",
+    )
+    @unittest.skipIf(TARGET_OPSET < 9, reason="not available")
+    def test_target_encoder_multiclass_assertion(self):
+        model = TargetEncoder()
+        X = np.array([0, 0, 1, 0, 0, 1, 1], dtype=np.int64).reshape(-1, 1)
+        y = np.array([0, 1, 2, 0, 1, 2, 0], dtype=np.int64)
+
+        model.fit(X, y)
+        with self.assertRaises(NotImplementedError):
+            convert_sklearn(
+                model,
+                "scikit-learn target encoder",
+                [("input", Int64TensorType([None, X.shape[1]]))],
+                target_opset=TARGET_OPSET,
+            )
+
+
+if __name__ == "__main__":
+    unittest.main(verbosity=2)
diff --git a/tests/test_utils/reference_implementation_ml.py b/tests/test_utils/reference_implementation_ml.py
index 9399c760a..9e0d3a3bf 100644
--- a/tests/test_utils/reference_implementation_ml.py
+++ b/tests/test_utils/reference_implementation_ml.py
@@ -419,3 +419,49 @@ def _run(self, x, int64_vocabulary=None, string_vocabulary=None):
                 return (np.array(res),)

             raise TypeError(f"x must be iterable not {type(x)}.")  # pragma: no cover
+
+
+    class TargetEncoder(OpRun):
+        op_domain = "ai.onnx.ml"
+
+        def _run(
+            self,
+            x,
+            default_float=None,
+            default_int64=None,
+            default_string=None,
+            keys_floats=None,
+            keys_int64s=None,
+            keys_strings=None,
+            values_floats=None,
+            values_int64s=None,
+            values_strings=None,
+        ):
+            # Same attribute layout as LabelEncoder: look each input value up
+            # in the keys_*/values_* mapping and fall back to default_*.
+            keys = keys_floats or keys_int64s or keys_strings
+            values = values_floats or values_int64s or values_strings
+            classes = {k: v for k, v in zip(keys, values)}
+            if id(keys) == id(keys_floats):
+                cast = float
+            elif id(keys) == id(keys_int64s):
+                cast = int
+            else:
+                cast = str
+            if id(values) == id(values_floats):
+                defval = default_float
+                dtype = np.float32
+            elif id(values) == id(values_int64s):
+                defval = default_int64
+                dtype = np.int64
+            else:
+                defval = default_string
+                if not isinstance(defval, str):
+                    defval = ""
+                dtype = np.str_
+            shape = x.shape
+            if len(x.shape) > 1:
+                x = x.flatten()
+            res = []
+            for i in range(0, x.shape[0]):
+                v = classes.get(cast(x[i]), defval)
+                res.append(v)
+            return (np.array(res, dtype=dtype).reshape(shape),)
diff --git a/tests/test_utils/utils_backend_onnx.py b/tests/test_utils/utils_backend_onnx.py
index 914e98592..86a571878 100644
--- a/tests/test_utils/utils_backend_onnx.py
+++ b/tests/test_utils/utils_backend_onnx.py
@@ -292,6 +292,7 @@ def _extract_attribute_value(self, att, ref_att=None):
                 LinearRegressor,
                 Normalizer,
                 OneHotEncoder,
+                TargetEncoder,
                 TfIdfVectorizer,
                 Scaler,
                 Scan,