[GSK-1711] Added check of push output (#1394)

* Added check of push output
* Fixed the seed for the models used in fixtures
* Switched hash to hashlib.sha1 to get stable output

---------

Co-authored-by: Bazire <[email protected]>
Giskard-AI · Sep 29, 2023 · 22c6678 · 22c6678
1 parent 18a9e05
commit 22c6678
Show file tree

Hide file tree

Showing 9 changed files with 87 additions and 138 deletions.
diff --git a/python-client/giskard/push/perturbation.py b/python-client/giskard/push/perturbation.py
@@ -10,9 +10,10 @@
 """
 from typing import Optional
 
+import hashlib
+
 import numpy as np
 import pandas as pd
-import sys
 
 from giskard.core.core import SupportedModelTypes
 from giskard.datasets.base import Dataset
@@ -31,6 +32,7 @@
     TextTypoTransformation,
     TextUppercase,
 )
+
 from ..push import PerturbationPush
 
 text_transformation_list = [
@@ -173,10 +175,16 @@ def _text(
             # TextTypoTransformation generates a random typo for text features. In order to have the same typo per
             # sample with the push feature in the debugger, we need to generate a unique seed per sample (hashed_seed)
             # to guarantee the same perturbation per sample.
-            hashed_seed = hash(f"{', '.join(map(lambda x: repr(x), ds_slice_copy.df.values))}".encode("utf-8"))
-            # hash could give negative ints, and np.random.seed accepts only positive ints
-            positive_hashed_seed = hashed_seed % ((sys.maxsize + 1) * 2)
-            kwargs = {"rng_seed": positive_hashed_seed}
+            # SHA-1 gives a seed that is stable across runs; collisions do not matter for this purpose
+            hashed_seed = int.from_bytes(
+                hashlib.sha1(
+                    (f"{', '.join(map(lambda x: repr(x), ds_slice_copy.df.values))}".encode("utf-8"))
+                ).digest(),
+                byteorder="big",
+                signed=False,
+            )
+            # the seed is non-negative, since the digest is decoded with signed=False
+            kwargs = {"rng_seed": hashed_seed}
 
         t = text_transformation(column=feature, **kwargs)
 

diff --git a/python-client/tests/fixtures/drug_classification__multiclass_classification.py b/python-client/tests/fixtures/drug_classification__multiclass_classification.py
@@ -72,7 +72,7 @@ def drug_classification_model(drug_classification_data) -> SKLearnModel:
         steps=[
             ("one_hot_encoder", OneHotEncoder()),
             ("resampler", SMOTE()),
-            ("classifier", SVC(kernel="linear", max_iter=250, probability=True)),
+            ("classifier", SVC(random_state=30, kernel="linear", max_iter=250, probability=True)),
         ]
     )
 

diff --git a/python-client/tests/fixtures/enron_multilabel_classification.py b/python-client/tests/fixtures/enron_multilabel_classification.py
@@ -153,7 +153,7 @@ def enron_model(enron_data) -> SKLearnModel:
             ("text_Mail", text_transformer, "Content"),
         ]
     )
-    clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100))])
+    clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100, random_state=30))])
 
     Y = enron_data.df["Target"]
     X = enron_data.df.drop(columns="Target")

diff --git a/python-client/tests/fixtures/fraud_detection__binary_classification.py b/python-client/tests/fixtures/fraud_detection__binary_classification.py
@@ -164,7 +164,7 @@ def fraud_detection_model(fraud_detection_train_data: Dataset) -> Model:
     x = fraud_detection_train_data.df.drop(TARGET_COLUMN, axis=1)
     y = fraud_detection_train_data.df[TARGET_COLUMN]
 
-    estimator = LGBMClassifier()
+    estimator = LGBMClassifier(random_state=30)
     estimator.fit(x, y)
 
     wrapped_model = Model(

diff --git a/python-client/tests/fixtures/german_credit_scoring.py b/python-client/tests/fixtures/german_credit_scoring.py
@@ -67,7 +67,7 @@ def german_credit_catboost_raw_model(german_credit_data):
     X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
         X, Y, test_size=0.20, random_state=30, stratify=Y
     )
-    cb = CatBoostClassifier(iterations=2, learning_rate=1, depth=2)
+    cb = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, random_seed=0)
     cb.fit(X_train, Y_train, columns_to_encode)
 
     model_score = cb.score(X_test, Y_test)
@@ -115,7 +115,7 @@ def german_credit_raw_model(german_credit_data):
             ("cat", categorical_transformer, columns_to_encode),
         ]
     )
-    clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100))])
+    clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100, random_state=30))])
 
     Y = german_credit_data.df["default"]
     X = german_credit_data.df[german_credit_data.columns].drop(columns="default")

diff --git a/python-client/tests/fixtures/hotel_text__regression.py b/python-client/tests/fixtures/hotel_text__regression.py
@@ -65,7 +65,7 @@ def hotel_text_model(hotel_text_data) -> SKLearnModel:
         steps=[
             ("vectorizer_adapter", FunctionTransformer(adapt_vectorizer_input)),
             ("vectorizer", TfidfVectorizer(max_features=10000)),
-            ("regressor", GradientBoostingRegressor(n_estimators=5)),
+            ("regressor", GradientBoostingRegressor(random_state=30, n_estimators=5)),
         ]
     )
 

diff --git a/python-client/tests/fixtures/medical_transcript_multiclass_classification.py b/python-client/tests/fixtures/medical_transcript_multiclass_classification.py
@@ -93,7 +93,7 @@ def medical_transcript_model(medical_transcript_data: Dataset) -> SKLearnModel:
             ("text_preprocessor", FunctionTransformer(preprocess_text)),
             ("vectorizer_input_adapter", FunctionTransformer(adapt_vectorizer_input)),
             ("vectorizer", CountVectorizer(ngram_range=(1, 1))),
-            ("estimator", RandomForestClassifier(n_estimators=1, max_depth=3)),
+            ("estimator", RandomForestClassifier(n_estimators=10, max_depth=3, random_state=30)),
         ]
     )
 

diff --git a/python-client/tests/fixtures/xboost_classification.py b/python-client/tests/fixtures/xboost_classification.py
@@ -32,7 +32,7 @@ def breast_cancer_model(breast_cancer_data: Dataset) -> Model:
         breast_cancer_data.df[TARGET_COLUMN_NAME],
         random_state=RANDOM_SEED,
     )
-    xgb = XGBClassifier(objective="binary:logistic")
+    xgb = XGBClassifier(objective="binary:logistic", random_state=30)
     xgb.fit(X_train, y_train)
     return Model(
         model=xgb,

diff --git a/python-client/tests/test_push.py b/python-client/tests/test_push.py
@@ -1,6 +1,11 @@
+import sys
+
 import numpy as np
-from giskard.ml_worker.testing.functions.transformation import mad_transformation
+import pandas as pd
+import pytest
 
+import giskard.push
+from giskard.ml_worker.testing.functions.transformation import mad_transformation
 from giskard.ml_worker.testing.registry.giskard_test import GiskardTest
 from giskard.ml_worker.testing.registry.slicing_function import slicing_function
 from giskard.push import Push
@@ -14,130 +19,68 @@
     slice_bounds_quartile,
 )
 from giskard.slicing.slice import QueryBasedSliceFunction
-import pandas as pd
-
-
-# Classification
-def test_instance_if_not_none(german_credit_model, german_credit_data):
-    for i in range(50):
-        push_list = [
-            create_contribution_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-            create_perturbation_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-            create_overconfidence_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-            create_borderline_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-        ]
-        for push in push_list:
-            if push is not None:
-                assert isinstance(push, Push)
 
-
-def test_slicing_function(german_credit_model, german_credit_data):
-    for i in range(50):
-        push = create_contribution_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]])
-        if push is not None:
-            assert isinstance(push.slicing_function, QueryBasedSliceFunction)
-
-
-def test_test_function(german_credit_model, german_credit_data):
-    for i in range(50):
-        push_list = [
-            create_contribution_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-            create_perturbation_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-            create_overconfidence_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-            create_borderline_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]),
-        ]
-        for push in push_list:
-            if push is not None:
-                for test in push.tests:
-                    assert isinstance(test(), GiskardTest)
-
-
-# Regression
-def test_instance_if_not_none_reg(linear_regression_diabetes, diabetes_dataset_with_target):
-    for i in range(50):
-        push_list = [
-            create_contribution_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-            create_perturbation_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-            create_overconfidence_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-            create_borderline_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-        ]
-        for push in push_list:
-            if push is not None:
-                assert isinstance(push, Push)
-
-
-def test_slicing_function_reg(linear_regression_diabetes, diabetes_dataset_with_target):
-    for i in range(50):
-        push = create_contribution_push(
-            linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-        )
+DATASETS = [
+    pytest.param(("german_credit_model", "german_credit_data", 50), id="German Credit"),
+    pytest.param(("enron_model", "enron_data", 50), id="Enron"),
+    pytest.param(("linear_regression_diabetes", "diabetes_dataset_with_target", 50), id="Diabetes"),
+]
+
+PUSH_TYPES = [
+    pytest.param(("contribution", giskard.push.ContributionPush, create_contribution_push), id="Contribution"),
+    pytest.param(("perturbation", giskard.push.PerturbationPush, create_perturbation_push), id="Perturbation"),
+    pytest.param(("overconfidence", giskard.push.OverconfidencePush, create_overconfidence_push), id="Overconfidence"),
+    pytest.param(("borderline", giskard.push.BorderlinePush, create_borderline_push), id="Borderline"),
+]
+# fmt: off
+EXPECTED_COUNTS = {
+    "german_credit_model" : {
+        "contribution" :[0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1],
+        "perturbation": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "overconfidence": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "borderline": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    },
+    "linear_regression_diabetes": {
+        "contribution" :[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0],
+        "perturbation" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "overconfidence" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "borderline" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    },
+    "enron_model": {
+        "contribution" :[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "perturbation" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "overconfidence" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+        "borderline" :[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
+    }
+}
+# fmt: on
+
+
+@pytest.mark.parametrize("dataset", DATASETS)
+@pytest.mark.parametrize("push_type", PUSH_TYPES)
+def test_test_function(request, dataset, push_type):
+    model_name, data_name, nb_line = dataset
+    model = request.getfixturevalue(model_name)
+    data = request.getfixturevalue(data_name)
+
+    push_type_name, push_type_class, push_func = push_type
+    if model_name == "enron_model" and push_type_name == "perturbation" and sys.platform == "win32":
+        pytest.skip("This test give different results on windows")
+
+    push_list = []
+    for i in range(nb_line):
+        push = push_func(model, data, data.df.iloc[[i]])
         if push is not None:
-            assert isinstance(push.slicing_function, QueryBasedSliceFunction)
-
-
-def test_test_function_reg(linear_regression_diabetes, diabetes_dataset_with_target):
-    for i in range(50):
-        push_list = [
-            create_contribution_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-            create_perturbation_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-            create_overconfidence_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-            create_borderline_push(
-                linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]]
-            ),
-        ]
-        for push in push_list:
-            if push is not None:
-                for test in push.tests:
-                    assert isinstance(test(), GiskardTest)
-
-
-# Multiclass Classification
-def test_instance_if_not_none_multi(enron_model, enron_data):
-    for i in range(50):
-        push_list = [
-            create_contribution_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-            create_perturbation_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-            create_overconfidence_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-            create_borderline_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-        ]
-        for push in push_list:
-            if push is not None:
-                assert isinstance(push, Push)
-
-
-def test_slicing_function_multi(enron_model, enron_data):
-    for i in range(50):
-        push = create_contribution_push(enron_model, enron_data, enron_data.df.iloc[[i]])
-        if push is not None:
-            assert isinstance(push.slicing_function, QueryBasedSliceFunction)
-
-
-def test_test_function_multi(enron_model, enron_data):
-    for i in range(50):
-        push_list = [
-            create_contribution_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-            create_perturbation_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-            create_overconfidence_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-            create_borderline_push(enron_model, enron_data, enron_data.df.iloc[[i]]),
-        ]
-        for push in push_list:
-            if push is not None:
-                for test in push.tests:
-                    assert isinstance(test(), GiskardTest)
+            assert isinstance(push, Push)
+            assert isinstance(push, push_type_class)
+            push_list.append(len(push.tests))
+            assert all([isinstance(test(), GiskardTest) for test in push.tests])
+            if hasattr(push, "slicing_function"):
+                assert isinstance(push.slicing_function, QueryBasedSliceFunction)
+        else:
+            push_list.append(0)
+    print(push_list)
+    assert push_list == EXPECTED_COUNTS[model_name][push_type_name]
 
 
 def test_mad_transformation_mad_precomputed(enron_data):
@@ -214,9 +157,7 @@ def test_coltype_to_supported_perturbation_type():
 
 
 def test_text_explain_in_push(medical_transcript_model, medical_transcript_data):
-
     problematic_df_entry = medical_transcript_data.df.iloc[[3]]
     output = create_contribution_push(medical_transcript_model, medical_transcript_data, problematic_df_entry)
-
     assert output is not None
     assert output.value is not None