diff --git a/python-client/giskard/push/perturbation.py b/python-client/giskard/push/perturbation.py index bca9838d4d..4c66099f36 100644 --- a/python-client/giskard/push/perturbation.py +++ b/python-client/giskard/push/perturbation.py @@ -10,9 +10,10 @@ """ from typing import Optional +import hashlib + import numpy as np import pandas as pd -import sys from giskard.core.core import SupportedModelTypes from giskard.datasets.base import Dataset @@ -31,6 +32,7 @@ TextTypoTransformation, TextUppercase, ) + from ..push import PerturbationPush text_transformation_list = [ @@ -173,10 +175,16 @@ def _text( # TextTypoTransformation generates a random typo for text features. In order to have the same typo per # sample with the push feature in the debugger, we need to generate a unique seed per sample (hashed_seed) # to guarantee the same perturbation per sample. - hashed_seed = hash(f"{', '.join(map(lambda x: repr(x), ds_slice_copy.df.values))}".encode("utf-8")) - # hash could give negative ints, and np.random.seed accepts only positive ints - positive_hashed_seed = hashed_seed % ((sys.maxsize + 1) * 2) - kwargs = {"rng_seed": positive_hashed_seed} + # SHA1 is deterministic across runs, unlike the salted built-in hash(); collisions do not matter here + hashed_seed = int.from_bytes( + hashlib.sha1( + (f"{', '.join(map(lambda x: repr(x), ds_slice_copy.df.values))}".encode("utf-8")) + ).digest(), + byteorder="big", + signed=False, + ) + # the seed is never negative, since the digest is decoded with signed=False + kwargs = {"rng_seed": hashed_seed} t = text_transformation(column=feature, **kwargs) diff --git a/python-client/tests/fixtures/drug_classification__multiclass_classification.py b/python-client/tests/fixtures/drug_classification__multiclass_classification.py index 31936d5cf6..8765ef2955 100644 --- a/python-client/tests/fixtures/drug_classification__multiclass_classification.py +++ b/python-client/tests/fixtures/drug_classification__multiclass_classification.py @@ -72,7 +72,7 @@ def drug_classification_model(drug_classification_data) 
-> SKLearnModel: steps=[ ("one_hot_encoder", OneHotEncoder()), ("resampler", SMOTE()), - ("classifier", SVC(kernel="linear", max_iter=250, probability=True)), + ("classifier", SVC(random_state=30, kernel="linear", max_iter=250, probability=True)), ] ) diff --git a/python-client/tests/fixtures/enron_multilabel_classification.py b/python-client/tests/fixtures/enron_multilabel_classification.py index 9f918d80d9..2b9e9790a3 100644 --- a/python-client/tests/fixtures/enron_multilabel_classification.py +++ b/python-client/tests/fixtures/enron_multilabel_classification.py @@ -153,7 +153,7 @@ def enron_model(enron_data) -> SKLearnModel: ("text_Mail", text_transformer, "Content"), ] ) - clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100))]) + clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100, random_state=30))]) Y = enron_data.df["Target"] X = enron_data.df.drop(columns="Target") diff --git a/python-client/tests/fixtures/fraud_detection__binary_classification.py b/python-client/tests/fixtures/fraud_detection__binary_classification.py index 5e06028f06..aff46911e3 100644 --- a/python-client/tests/fixtures/fraud_detection__binary_classification.py +++ b/python-client/tests/fixtures/fraud_detection__binary_classification.py @@ -164,7 +164,7 @@ def fraud_detection_model(fraud_detection_train_data: Dataset) -> Model: x = fraud_detection_train_data.df.drop(TARGET_COLUMN, axis=1) y = fraud_detection_train_data.df[TARGET_COLUMN] - estimator = LGBMClassifier() + estimator = LGBMClassifier(random_state=30) estimator.fit(x, y) wrapped_model = Model( diff --git a/python-client/tests/fixtures/german_credit_scoring.py b/python-client/tests/fixtures/german_credit_scoring.py index 61092f0c2b..9036386959 100644 --- a/python-client/tests/fixtures/german_credit_scoring.py +++ b/python-client/tests/fixtures/german_credit_scoring.py @@ -67,7 +67,7 @@ def 
german_credit_catboost_raw_model(german_credit_data): X_train, X_test, Y_train, Y_test = model_selection.train_test_split( X, Y, test_size=0.20, random_state=30, stratify=Y ) - cb = CatBoostClassifier(iterations=2, learning_rate=1, depth=2) + cb = CatBoostClassifier(iterations=2, learning_rate=1, depth=2, random_seed=0) cb.fit(X_train, Y_train, columns_to_encode) model_score = cb.score(X_test, Y_test) @@ -115,7 +115,7 @@ def german_credit_raw_model(german_credit_data): ("cat", categorical_transformer, columns_to_encode), ] ) - clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100))]) + clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression(max_iter=100, random_state=30))]) Y = german_credit_data.df["default"] X = german_credit_data.df[german_credit_data.columns].drop(columns="default") diff --git a/python-client/tests/fixtures/hotel_text__regression.py b/python-client/tests/fixtures/hotel_text__regression.py index 479656b74a..4961981707 100644 --- a/python-client/tests/fixtures/hotel_text__regression.py +++ b/python-client/tests/fixtures/hotel_text__regression.py @@ -65,7 +65,7 @@ def hotel_text_model(hotel_text_data) -> SKLearnModel: steps=[ ("vectorizer_adapter", FunctionTransformer(adapt_vectorizer_input)), ("vectorizer", TfidfVectorizer(max_features=10000)), - ("regressor", GradientBoostingRegressor(n_estimators=5)), + ("regressor", GradientBoostingRegressor(random_state=30, n_estimators=5)), ] ) diff --git a/python-client/tests/fixtures/medical_transcript_multiclass_classification.py b/python-client/tests/fixtures/medical_transcript_multiclass_classification.py index 7816ed2e63..65151f65b6 100644 --- a/python-client/tests/fixtures/medical_transcript_multiclass_classification.py +++ b/python-client/tests/fixtures/medical_transcript_multiclass_classification.py @@ -93,7 +93,7 @@ def medical_transcript_model(medical_transcript_data: Dataset) -> SKLearnModel: ("text_preprocessor", 
FunctionTransformer(preprocess_text)), ("vectorizer_input_adapter", FunctionTransformer(adapt_vectorizer_input)), ("vectorizer", CountVectorizer(ngram_range=(1, 1))), - ("estimator", RandomForestClassifier(n_estimators=1, max_depth=3)), + ("estimator", RandomForestClassifier(n_estimators=10, max_depth=3, random_state=30)), ] ) diff --git a/python-client/tests/fixtures/xboost_classification.py b/python-client/tests/fixtures/xboost_classification.py index 91d4a9f36a..316394f1d6 100644 --- a/python-client/tests/fixtures/xboost_classification.py +++ b/python-client/tests/fixtures/xboost_classification.py @@ -32,7 +32,7 @@ def breast_cancer_model(breast_cancer_data: Dataset) -> Model: breast_cancer_data.df[TARGET_COLUMN_NAME], random_state=RANDOM_SEED, ) - xgb = XGBClassifier(objective="binary:logistic") + xgb = XGBClassifier(objective="binary:logistic", random_state=30) xgb.fit(X_train, y_train) return Model( model=xgb, diff --git a/python-client/tests/test_push.py b/python-client/tests/test_push.py index 03bceca4dd..1b3a77fba9 100644 --- a/python-client/tests/test_push.py +++ b/python-client/tests/test_push.py @@ -1,6 +1,11 @@ +import sys + import numpy as np -from giskard.ml_worker.testing.functions.transformation import mad_transformation +import pandas as pd +import pytest +import giskard.push +from giskard.ml_worker.testing.functions.transformation import mad_transformation from giskard.ml_worker.testing.registry.giskard_test import GiskardTest from giskard.ml_worker.testing.registry.slicing_function import slicing_function from giskard.push import Push @@ -14,130 +19,68 @@ slice_bounds_quartile, ) from giskard.slicing.slice import QueryBasedSliceFunction -import pandas as pd - - -# Classification -def test_instance_if_not_none(german_credit_model, german_credit_data): - for i in range(50): - push_list = [ - create_contribution_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]), - create_perturbation_push(german_credit_model, 
german_credit_data, german_credit_data.df.iloc[[i]]), - create_overconfidence_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]), - create_borderline_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]), - ] - for push in push_list: - if push is not None: - assert isinstance(push, Push) - -def test_slicing_function(german_credit_model, german_credit_data): - for i in range(50): - push = create_contribution_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]) - if push is not None: - assert isinstance(push.slicing_function, QueryBasedSliceFunction) - - -def test_test_function(german_credit_model, german_credit_data): - for i in range(50): - push_list = [ - create_contribution_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]), - create_perturbation_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]), - create_overconfidence_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]), - create_borderline_push(german_credit_model, german_credit_data, german_credit_data.df.iloc[[i]]), - ] - for push in push_list: - if push is not None: - for test in push.tests: - assert isinstance(test(), GiskardTest) - - -# Regression -def test_instance_if_not_none_reg(linear_regression_diabetes, diabetes_dataset_with_target): - for i in range(50): - push_list = [ - create_contribution_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - create_perturbation_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - create_overconfidence_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - create_borderline_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - ] - for push in push_list: - if push 
is not None: - assert isinstance(push, Push) - - -def test_slicing_function_reg(linear_regression_diabetes, diabetes_dataset_with_target): - for i in range(50): - push = create_contribution_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ) +DATASETS = [ + pytest.param(("german_credit_model", "german_credit_data", 50), id="German Credit"), + pytest.param(("enron_model", "enron_data", 50), id="Enron"), + pytest.param(("linear_regression_diabetes", "diabetes_dataset_with_target", 50), id="Diabetes"), +] + +PUSH_TYPES = [ + pytest.param(("contribution", giskard.push.ContributionPush, create_contribution_push), id="Contribution"), + pytest.param(("perturbation", giskard.push.PerturbationPush, create_perturbation_push), id="Perturbation"), + pytest.param(("overconfidence", giskard.push.OverconfidencePush, create_overconfidence_push), id="Overconfidence"), + pytest.param(("borderline", giskard.push.BorderlinePush, create_borderline_push), id="Borderline"), +] +# fmt: off +EXPECTED_COUNTS = { + "german_credit_model" : { + "contribution" :[0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1], + "perturbation": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "overconfidence": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "borderline": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + }, + "linear_regression_diabetes": { + "contribution" :[0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0], + "perturbation" :[0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "overconfidence" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "borderline" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + }, + "enron_model": { + "contribution" :[0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "perturbation" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "overconfidence" :[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "borderline" :[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0], + } +} +# fmt: on + + +@pytest.mark.parametrize("dataset", DATASETS) +@pytest.mark.parametrize("push_type", PUSH_TYPES) +def test_test_function(request, dataset, push_type): + model_name, data_name, nb_line = dataset + model = request.getfixturevalue(model_name) + data = request.getfixturevalue(data_name) + + push_type_name, push_type_class, push_func = push_type + if model_name == "enron_model" and push_type_name == "perturbation" and sys.platform == "win32": + pytest.skip("This test give different results on windows") + + push_list = [] + for i in range(nb_line): + push = push_func(model, data, data.df.iloc[[i]]) if push is not None: - assert isinstance(push.slicing_function, QueryBasedSliceFunction) - - -def test_test_function_reg(linear_regression_diabetes, diabetes_dataset_with_target): - 
for i in range(50): - push_list = [ - create_contribution_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - create_perturbation_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - create_overconfidence_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - create_borderline_push( - linear_regression_diabetes, diabetes_dataset_with_target, diabetes_dataset_with_target.df.iloc[[i]] - ), - ] - for push in push_list: - if push is not None: - for test in push.tests: - assert isinstance(test(), GiskardTest) - - -# Multiclass Classification -def test_instance_if_not_none_multi(enron_model, enron_data): - for i in range(50): - push_list = [ - create_contribution_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - create_perturbation_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - create_overconfidence_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - create_borderline_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - ] - for push in push_list: - if push is not None: - assert isinstance(push, Push) - - -def test_slicing_function_multi(enron_model, enron_data): - for i in range(50): - push = create_contribution_push(enron_model, enron_data, enron_data.df.iloc[[i]]) - if push is not None: - assert isinstance(push.slicing_function, QueryBasedSliceFunction) - - -def test_test_function_multi(enron_model, enron_data): - for i in range(50): - push_list = [ - create_contribution_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - create_perturbation_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - create_overconfidence_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - create_borderline_push(enron_model, enron_data, enron_data.df.iloc[[i]]), - ] - for push in push_list: - if push is not None: - for test in push.tests: - assert 
isinstance(test(), GiskardTest) + assert isinstance(push, Push) + assert isinstance(push, push_type_class) + push_list.append(len(push.tests)) + assert all([isinstance(test(), GiskardTest) for test in push.tests]) + if hasattr(push, "slicing_function"): + assert isinstance(push.slicing_function, QueryBasedSliceFunction) + else: + push_list.append(0) + print(push_list) + assert push_list == EXPECTED_COUNTS[model_name][push_type_name] def test_mad_transformation_mad_precomputed(enron_data): @@ -214,9 +157,7 @@ def test_coltype_to_supported_perturbation_type(): def test_text_explain_in_push(medical_transcript_model, medical_transcript_data): - problematic_df_entry = medical_transcript_data.df.iloc[[3]] output = create_contribution_push(medical_transcript_model, medical_transcript_data, problematic_df_entry) - assert output is not None assert output.value is not None