Skip to content

Commit

Permalink
Merge branch 'v0-legacy' into develop
Browse files Browse the repository at this point in the history
Conflicts:
	src/sensai/util/pickle.py
	tests/backwardscompat/test_models.py
	tests/conftest.py
  • Loading branch information
opcode81 committed Aug 6, 2023
2 parents a144b9d + 9a0250c commit 4c64303
Show file tree
Hide file tree
Showing 8 changed files with 164 additions and 108 deletions.
8 changes: 6 additions & 2 deletions src/sensai/util/pickle.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import logging
import os
import pickle
from typing import List, Dict, Any, Iterable
from pathlib import Path
from typing import List, Dict, Any, Iterable, Union

import joblib

Expand Down Expand Up @@ -29,7 +30,10 @@ def read_file(f):
return read_file(f)


def dump_pickle(obj, pickle_path, backend="pickle", protocol=pickle.HIGHEST_PROTOCOL):
def dump_pickle(obj, pickle_path: Union[str, Path], backend="pickle", protocol=pickle.HIGHEST_PROTOCOL):
if isinstance(pickle_path, Path):
pickle_path = str(pickle_path)

def open_file():
if is_s3_path(pickle_path):
return S3Object(pickle_path).open_file("wb")
Expand Down
58 changes: 58 additions & 0 deletions tests/backwardscompat/create_test_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
import sys

from sensai.data_transformation import DFTNormalisation, DFTOneHotEncoder, SkLearnTransformerFactoryFactory
from sensai.featuregen import FeatureCollector, FeatureGeneratorTakeColumns
from sensai.sklearn.sklearn_regression import SkLearnLinearRegressionVectorRegressionModel, SkLearnRandomForestVectorRegressionModel, \
SkLearnMultiLayerPerceptronVectorRegressionModel
from sensai.util import logging
from sensai.util.pickle import dump_pickle
from tests.model_test_case import DiabetesDataSet, RegressionTestCase, RESOURCE_PATH


def create_regression_models_for_backward_compatibility_test(version):
    """
    Fits a set of reference regression models on the diabetes data set and persists each of them,
    together with the R2 value it achieved, for use in backward compatibility tests.

    The models are fitted and evaluated with the evaluator created by ``RegressionTestCase.createEvaluator()``.
    For every model, one pickle file named ``regression_model_<model name>.<version>.pickle`` is written to
    ``RESOURCE_PATH / "backward_compatibility"``; it contains a dictionary with the keys ``"R2"`` and ``"model"``.

    :param version: version with which the files are created (becomes part of the file names)
    """
    dataset = DiabetesDataSet()

    # categorical columns are excluded from normalisation (unsupported=True),
    # numeric columns are declared as independent columns for normalisation
    fc = FeatureCollector(
        FeatureGeneratorTakeColumns(dataset.categorical_features, categorical_feature_names=dataset.categorical_features,
            normalisation_rule_template=DFTNormalisation.RuleTemplate(unsupported=True)),
        FeatureGeneratorTakeColumns(dataset.numeric_features,
            normalisation_rule_template=DFTNormalisation.RuleTemplate(independent_columns=True)))

    model_linear = SkLearnLinearRegressionVectorRegressionModel() \
        .with_feature_collector(fc) \
        .with_feature_transformers(
            DFTOneHotEncoder(fc.get_categorical_feature_name_regex()),
            DFTNormalisation(fc.get_normalisation_rules(), default_transformer_factory=SkLearnTransformerFactoryFactory.RobustScaler())) \
        .with_name("Linear")

    model_rf = SkLearnRandomForestVectorRegressionModel(n_estimators=10, min_samples_leaf=10) \
        .with_feature_collector(fc) \
        .with_feature_transformers(DFTOneHotEncoder(fc.get_categorical_feature_name_regex())) \
        .with_name("RandomForest")

    model_mlp = SkLearnMultiLayerPerceptronVectorRegressionModel(hidden_layer_sizes=(20, 20), solver="adam", max_iter=1000,
            batch_size=32, early_stopping=True) \
        .with_feature_collector(fc) \
        .with_feature_transformers(
            DFTOneHotEncoder(fc.get_categorical_feature_name_regex()),
            DFTNormalisation(fc.get_normalisation_rules(), default_transformer_factory=SkLearnTransformerFactoryFactory.RobustScaler())) \
        .with_name("SkLearnMLP")

    test_case = RegressionTestCase(dataset.getInputOutputData())
    ev = test_case.createEvaluator()
    output_dir = RESOURCE_PATH / "backward_compatibility"
    for model in [model_linear, model_rf, model_mlp]:
        ev.fit_model(model)
        eval_data = ev.eval_model(model)
        eval_stats = eval_data.get_eval_stats()
        print(eval_stats)
        # use the snake_case accessor, which is the one the test helpers (testMinR2) rely on
        r2 = eval_stats.get_r2()
        persisted_data = {"R2": r2, "model": model}
        dump_pickle(persisted_data, output_dir / f"regression_model_{model.get_name()}.{version}.pickle")


if __name__ == '__main__':
    # Script entry point: calls logging.configure() and then creates the model pickles
    # tagged with the version string given below.
    logging.configure()
    # NOTE(review): this statement runs only after the module-level imports at the top of the file
    # have already been resolved, so it cannot influence them - confirm whether it is still needed.
    sys.path.append("../..")
    create_regression_models_for_backward_compatibility_test("v0.2.0")
46 changes: 14 additions & 32 deletions tests/backwardscompat/test_models.py
Original file line number Diff line number Diff line change
@@ -1,46 +1,28 @@
import os
from glob import glob

import pytest

import sensai
from sensai import VectorModel
from sensai.data_transformation import DFTNormalisation, SkLearnTransformerFactoryFactory, DFTOneHotEncoder
from sensai.featuregen import FeatureGeneratorTakeColumns, FeatureCollector
from sensai.sklearn.sklearn_regression import SkLearnLinearRegressionVectorRegressionModel, SkLearnRandomForestVectorRegressionModel, \
SkLearnMultiLayerPerceptronVectorRegressionModel
from sensai.util.pickle import load_pickle
from tests.conftest import RESOURCE_DIR


def test_modelCanBeLoaded(testResources, irisClassificationTestCase):
def test_classification_model_backward_compatibility_v0_0_4(testResources, irisClassificationTestCase):
    """
    Loads the persisted torch MLP classifier from the test resources and checks that it is of the expected
    type and reaches an accuracy of at least 0.8 on the iris test case without being re-fitted.
    """
    # The model file was generated with tests/frameworks/torch/test_torch.test_MLPClassifier at commit f93c6b11d
    loaded_model = VectorModel.load(os.path.join(testResources, "torch_mlp.pickle"))
    expected_type = sensai.torch.models.MultiLayerPerceptronVectorClassificationModel
    assert isinstance(loaded_model, expected_type)
    irisClassificationTestCase.testMinAccuracy(loaded_model, 0.8, fit=False)


# TODO
def createRegressionModelsForBackwardsCompatibilityTest(testCase):
    """
    Builds (unfitted) linear, random forest and MLP regression models which share one feature collector.

    :param testCase: not used by this function
    :return: the MLP model only; the linear and random forest models are constructed but not returned
    """
    rule_template = DFTNormalisation.RuleTemplate(independentColumns=False)
    feature_generator = FeatureGeneratorTakeColumns(categoricalFeatureNames=["SEX"], normalisationRuleTemplate=rule_template)
    fc = FeatureCollector(feature_generator)

    # NOTE: unlike the MLP below, the linear model is configured without a DFTNormalisation transformer
    linear_model = (
        SkLearnLinearRegressionVectorRegressionModel()
        .withFeatureCollector(fc)
        .withFeatureTransformers(DFTOneHotEncoder(fc.getCategoricalFeatureNameRegex()))
    )

    rf_model = (
        SkLearnRandomForestVectorRegressionModel()
        .withFeatureCollector(fc)
        .withFeatureTransformers(DFTOneHotEncoder(fc.getCategoricalFeatureNameRegex()))
    )

    mlp_model = (
        SkLearnMultiLayerPerceptronVectorRegressionModel(hidden_layer_sizes=(10, 10), solver="lbfgs")
        .withFeatureCollector(fc)
        .withFeatureTransformers(
            DFTOneHotEncoder(fc.getCategoricalFeatureNameRegex()),
            DFTNormalisation(fc.getNormalisationRules(), defaultTransformerFactory=SkLearnTransformerFactoryFactory.RobustScaler()))
    )

    return mlp_model

@pytest.mark.parametrize("pickle_file", glob(f"{RESOURCE_DIR}/backward_compatibility/regression_model_*.v0.2.0.pickle"))
def test_regression_model_backward_compatibility_v0_2_0(pickle_file, diabetesRegressionTestCase):
    """
    Tests for compatibility with models created with v0.2.0 using create_test_models.py:
    each persisted model must, without re-fitting, reach at least its stored R2 value minus a tolerance of 0.02.
    """
    persisted = load_pickle(pickle_file)
    stored_r2 = persisted["R2"]
    stored_model = persisted["model"]
    min_r2 = stored_r2-0.02
    diabetesRegressionTestCase.testMinR2(stored_model, min_r2, fit=False)

# TODO
def todo_test_backward_compatibility_v020(diabetesRegressionTestCase):
    """
    Fits the model returned by createRegressionModelsForBackwardsCompatibilityTest and
    requires it to reach an R2 of at least 0.5.

    NOTE(review): presumably not collected by pytest because the name does not start with ``test`` - confirm
    against the project's pytest configuration.
    """
    candidate = createRegressionModelsForBackwardsCompatibilityTest(diabetesRegressionTestCase)
    required_r2 = 0.5
    diabetesRegressionTestCase.testMinR2(candidate, required_r2, fit=True)
75 changes: 1 addition & 74 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,58 +2,20 @@
import os
import sys

import pandas as pd
import pytest
from sklearn.datasets import load_iris

from sensai import InputOutputData, VectorClassificationModel, VectorRegressionModel
from sensai.evaluation import VectorClassificationModelEvaluator, VectorRegressionModelEvaluator, VectorRegressionModelEvaluatorParams, \
VectorClassificationModelEvaluatorParams
from model_test_case import IrisDataSet, ClassificationTestCase, RegressionTestCase, DiabetesDataSet, RESOURCE_DIR

sys.path.append(os.path.abspath("."))
from config import topLevelDirectory

log = logging.getLogger(__name__)


RESOURCE_DIR = os.path.join(topLevelDirectory, "tests", "resources")


@pytest.fixture(scope="session")
def testResources():
    """
    Session-scoped fixture providing the path of the test resources directory.

    :return: the value of ``RESOURCE_DIR``
    """
    return RESOURCE_DIR


class IrisDataSet:
    """
    Provides the iris data set as an InputOutputData instance, which is created on first access
    and cached at class level thereafter.
    """
    _iod = None

    @classmethod
    def getInputOutputData(cls):
        """
        :return: the (cached) data; the inputs use the data set's feature names as columns, and the
            single output column "class" holds the target name of each sample
        """
        if cls._iod is not None:
            return cls._iod
        iris = load_iris()
        feature_df = pd.DataFrame(iris["data"], columns=iris["feature_names"])
        class_names = iris["target_names"]
        label_df = pd.DataFrame({"class": [class_names[idx] for idx in iris["target"]]})
        cls._iod = InputOutputData(feature_df, label_df)
        return cls._iod


class ClassificationTestCase:
    """
    Test helper which evaluates classification models on a given data set, using a test fraction of 0.2.
    """
    def __init__(self, data: InputOutputData):
        """
        :param data: the data on which models are fitted and evaluated
        """
        self.data = data

    def testMinAccuracy(self, model: VectorClassificationModel, minAccuracy: float, fit=True):
        """
        Asserts that the model reaches the given minimum accuracy.

        :param model: the model to evaluate
        :param minAccuracy: the accuracy that must at least be reached
        :param fit: whether to fit the model before evaluating it (pass False for models that are already fitted)
        """
        evaluator_params = VectorClassificationModelEvaluatorParams(fractional_split_test_fraction=0.2)
        evaluator = VectorClassificationModelEvaluator(self.data, params=evaluator_params)
        if fit:
            evaluator.fit_model(model)
        eval_stats = evaluator.eval_model(model).get_eval_stats()
        log.info(f"Results for {model.get_name()}: {eval_stats}")
        accuracy = eval_stats.get_accuracy()
        assert accuracy >= minAccuracy


@pytest.fixture(scope="session")
def irisDataSet():
    """
    Session-scoped fixture providing the iris data set.

    :return: a new ``IrisDataSet`` instance
    """
    return IrisDataSet()
Expand All @@ -64,41 +26,6 @@ def irisClassificationTestCase(irisDataSet):
return ClassificationTestCase(irisDataSet.getInputOutputData())


class RegressionTestCase:
    """
    Test helper which evaluates regression models on a given data set, using a shuffled split with
    a test fraction of 0.2 and a fixed random seed.
    """
    def __init__(self, data: InputOutputData):
        """
        :param data: the data on which models are fitted and evaluated
        """
        self.data = data
        self.evaluatorParams = VectorRegressionModelEvaluatorParams(
            fractional_split_test_fraction=0.2,
            fractional_split_shuffle=True,
            fractional_split_random_seed=42)

    def testMinR2(self, model: VectorRegressionModel, minR2: float, fit=True):
        """
        Asserts that the model reaches the given minimum R2 value.

        :param model: the model to evaluate
        :param minR2: the R2 value that must at least be reached
        :param fit: whether to fit the model before evaluating it (pass False for models that are already fitted)
        """
        evaluator = VectorRegressionModelEvaluator(self.data, params=self.evaluatorParams)
        if fit:
            evaluator.fit_model(model)
        eval_stats = evaluator.eval_model(model).get_eval_stats()
        log.info(f"Results for {model.get_name()}: {eval_stats}")
        achieved_r2 = eval_stats.get_r2()
        assert achieved_r2 >= minR2


class DiabetesDataSet:
    """
    Classic diabetes data set (downloaded from https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt)

    The data is read from the file "diabetes.tab.txt" in the test resources directory on first access
    and cached at class level thereafter.
    """
    _iod = None

    @classmethod
    def getInputOutputData(cls):
        """
        :return: the (cached) data, with column "Y" as the output and all remaining columns as inputs
        """
        if cls._iod is None:
            df = pd.read_csv(os.path.join(RESOURCE_DIR, "diabetes.tab.txt"), sep="\t")
            # store the result; previously the cache was never filled and the file was re-read on every call
            cls._iod = InputOutputData.from_data_frame(df, "Y")
        return cls._iod


@pytest.fixture(scope="session")
def diabetesDataSet():
    """
    Session-scoped fixture providing the diabetes data set.

    :return: a new ``DiabetesDataSet`` instance
    """
    return DiabetesDataSet()
Expand Down
85 changes: 85 additions & 0 deletions tests/model_test_case.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
import os
from pathlib import Path

import pandas as pd
from sklearn.datasets import load_iris

from sensai import InputOutputData, VectorClassificationModel, VectorRegressionModel
from sensai.evaluation import VectorClassificationModelEvaluator, VectorRegressionModelEvaluatorParams, VectorRegressionModelEvaluator, \
VectorClassificationModelEvaluatorParams
from sensai.util import logging

log = logging.getLogger(__name__)
RESOURCE_PATH = Path(__file__).resolve().parent / "resources"
RESOURCE_DIR = str(RESOURCE_PATH)


class IrisDataSet:
    """
    Lazily created, class-level cached representation of the iris data set as InputOutputData.
    """
    _iod = None

    @classmethod
    def getInputOutputData(cls):
        """
        :return: the (cached) data; input columns are named after the data set's features, and the
            single output column "class" contains each sample's target name
        """
        if cls._iod is not None:
            return cls._iod
        raw = load_iris()
        target_names = raw["target_names"]
        input_df = pd.DataFrame(raw["data"], columns=raw["feature_names"])
        output_df = pd.DataFrame({"class": [target_names[i] for i in raw["target"]]})
        cls._iod = InputOutputData(input_df, output_df)
        return cls._iod


class ClassificationTestCase:
    """
    Test helper which evaluates classification models on a given data set, using a shuffled split with
    a test fraction of 0.2 and a fixed random seed.
    """
    def __init__(self, data: InputOutputData):
        """
        :param data: the data on which models are fitted and evaluated
        """
        self.data = data

    def testMinAccuracy(self, model: VectorClassificationModel, minAccuracy: float, fit=True):
        """
        Asserts that the model reaches the given minimum accuracy.

        :param model: the model to evaluate
        :param minAccuracy: the accuracy that must at least be reached
        :param fit: whether to fit the model before evaluating it (pass False for models that are already fitted)
        """
        evaluator_params = VectorClassificationModelEvaluatorParams(
            fractional_split_test_fraction=0.2,
            fractional_split_random_seed=42,
            fractional_split_shuffle=True)
        evaluator = VectorClassificationModelEvaluator(self.data, params=evaluator_params)
        if fit:
            evaluator.fit_model(model)
        eval_stats = evaluator.eval_model(model).get_eval_stats()
        log.info(f"Results for {model.get_name()}: {eval_stats}")
        accuracy = eval_stats.get_accuracy()
        assert accuracy >= minAccuracy


class RegressionTestCase:
    """
    Test helper which evaluates regression models on a given data set, using a shuffled split with
    a test fraction of 0.2 and a fixed random seed.
    """
    def __init__(self, data: InputOutputData):
        """
        :param data: the data on which models are fitted and evaluated
        """
        self.data = data
        self.evaluatorParams = VectorRegressionModelEvaluatorParams(
            fractional_split_test_fraction=0.2,
            fractional_split_shuffle=True,
            fractional_split_random_seed=42)

    def createEvaluator(self) -> VectorRegressionModelEvaluator:
        """
        :return: a new evaluator for this test case's data, configured with this test case's evaluator parameters
        """
        evaluator = VectorRegressionModelEvaluator(self.data, params=self.evaluatorParams)
        return evaluator

    def testMinR2(self, model: VectorRegressionModel, minR2: float, fit=True):
        """
        Asserts that the model reaches the given minimum R2 value.

        :param model: the model to evaluate
        :param minR2: the R2 value that must at least be reached
        :param fit: whether to fit the model before evaluating it (pass False for models that are already fitted)
        """
        evaluator = self.createEvaluator()
        if fit:
            evaluator.fit_model(model)
        eval_stats = evaluator.eval_model(model).get_eval_stats()
        log.info(f"Results for {model.get_name()}: {eval_stats}")
        achieved_r2 = eval_stats.get_r2()
        assert achieved_r2 >= minR2


class DiabetesDataSet:
    """
    Classic diabetes data set (downloaded from https://www4.stat.ncsu.edu/~boos/var.select/diabetes.tab.txt)

    The data is read from the file "diabetes.tab.txt" in the test resources directory on first access
    and cached at class level thereafter.
    """
    _iod = None
    # names of the columns treated as categorical and numeric features respectively
    categorical_features = ["SEX"]
    numeric_features = ["AGE", "BMI", "BP", "S1", "S2", "S3", "S4", "S5", "S6"]

    @classmethod
    def getInputOutputData(cls):
        """
        :return: the (cached) data, with column "Y" as the output and all remaining columns as inputs
        """
        if cls._iod is None:
            df = pd.read_csv(os.path.join(RESOURCE_DIR, "diabetes.tab.txt"), sep="\t")
            # store the result; previously the cache was never filled and the file was re-read on every call
            cls._iod = InputOutputData.from_data_frame(df, "Y")
        return cls._iod
Binary file not shown.
Binary file not shown.
Binary file not shown.

0 comments on commit 4c64303

Please sign in to comment.