From 1f307a18bc67e88cb10a7001c9b25adff03a4e65 Mon Sep 17 00:00:00 2001 From: Alexander Guschin <1aguschin@gmail.com> Date: Thu, 13 Jul 2023 11:03:01 +0600 Subject: [PATCH] Support IsolationForest (#693) close https://github.com/iterative/mlem.ai/issues/353 See also https://github.com/iterative/mlem/issues/423#issuecomment-1610920683 --- .pre-commit-config.yaml | 2 +- mlem/contrib/sklearn.py | 4 ++-- tests/contrib/test_sklearn.py | 21 +++++++++++++++++++-- 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2356a3c3..8fbeb621 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -44,7 +44,7 @@ repos: - types-requests - types-six - types-PyYAML - - pydantic + - pydantic>=1.9.0,<2 - types-filelock - types-emoji - repo: local diff --git a/mlem/contrib/sklearn.py b/mlem/contrib/sklearn.py index a027174e..926a8493 100644 --- a/mlem/contrib/sklearn.py +++ b/mlem/contrib/sklearn.py @@ -6,7 +6,7 @@ from typing import Any, ClassVar, Dict, List, Optional, Union import sklearn -from sklearn.base import ClassifierMixin, RegressorMixin +from sklearn.base import ClassifierMixin, OutlierMixin, RegressorMixin from sklearn.feature_extraction.text import TransformerMixin, _VectorizerMixin from sklearn.pipeline import Pipeline from sklearn.preprocessing._encoders import _BaseEncoder @@ -28,7 +28,7 @@ class SklearnModel(ModelType, ModelHook, IsInstanceHookMixin): """ModelType implementation for `scikit-learn` models""" type: ClassVar[str] = "sklearn" - valid_types: ClassVar = (RegressorMixin, ClassifierMixin) + valid_types: ClassVar = (RegressorMixin, ClassifierMixin, OutlierMixin) io: ModelIO = SimplePickleIO() """IO""" diff --git a/tests/contrib/test_sklearn.py b/tests/contrib/test_sklearn.py index 3f3c1a25..3f3da55d 100644 --- a/tests/contrib/test_sklearn.py +++ b/tests/contrib/test_sklearn.py @@ -3,6 +3,7 @@ import lightgbm as lgb import numpy as np import pytest +from sklearn.ensemble import IsolationForest from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer from sklearn.linear_model import LinearRegression, LogisticRegression from sklearn.pipeline import Pipeline @@ -54,6 +55,13 @@ def regressor(inp_data, out_data): return lr +@pytest.fixture +def outlier(inp_data): + model = IsolationForest() + model.fit(inp_data) + return model + + @pytest.fixture def count_vectorizer(text_inp_data): vectorizer = CountVectorizer() @@ -195,7 +203,9 @@ def test_hook_lgb(lgbm_model, inp_data): assert signature.returns == returns -@pytest.mark.parametrize("model", ["classifier", "regressor", "pipeline"]) +@pytest.mark.parametrize( + "model", ["classifier", "regressor", "pipeline", "outlier"] +) def test_model_type__predict(model, inp_data, request): model = request.getfixturevalue(model) model_type = ModelAnalyzer.analyze(model, sample_data=inp_data) @@ -221,7 +231,14 @@ def test_model_type__reg_predict_proba(regressor, inp_data): model_type.call_method("predict_proba", inp_data) -@pytest.mark.parametrize("model", ["classifier", "regressor"]) +def test_model_type__outlier_predict_proba(outlier, inp_data): + model_type = ModelAnalyzer.analyze(outlier, sample_data=inp_data) + + with pytest.raises(ValueError): + model_type.call_method("predict_proba", inp_data) + + +@pytest.mark.parametrize("model", ["classifier", "regressor", "outlier"]) def test_model_type__dump_load(tmpdir, model, inp_data, request): model = request.getfixturevalue(model) model_type = ModelAnalyzer.analyze(model, sample_data=inp_data)