Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: easily create a baseline model #811

Merged
merged 32 commits into main from 710-easily-create-a-baseline-model
Jun 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ebcb811
add basic implementation of BaselineClassifier
sibre28 May 24, 2024
970021c
add basic implementation of BaselineRegressor
sibre28 May 24, 2024
1e78d18
change stuff to terminate execution after max 30 sec
sibre28 May 25, 2024
608f1f6
Add data validation
sibre28 May 31, 2024
91db191
Add tests
sibre28 May 31, 2024
eab2a39
Linter fixes
sibre28 Jun 1, 2024
4b6edd7
Linter fixes
sibre28 Jun 1, 2024
ab6296e
add docs
sibre28 Jun 1, 2024
1f6347c
Merge branch 'main' into 710-easily-create-a-baseline-model
sibre28 Jun 1, 2024
a6e3061
linter fixes
sibre28 Jun 1, 2024
f6b1002
linter fixes
sibre28 Jun 1, 2024
6af2d0b
linter fixes
sibre28 Jun 1, 2024
a1b4153
style: apply automated linter fixes
megalinter-bot Jun 1, 2024
6baff71
Add DatasetMissesTargetError
sibre28 Jun 1, 2024
96f6dee
Merge remote-tracking branch 'origin/710-easily-create-a-baseline-mod…
sibre28 Jun 1, 2024
da67f73
add docs
sibre28 Jun 1, 2024
8e12811
linter fix
sibre28 Jun 1, 2024
a62c617
style: apply automated linter fixes
megalinter-bot Jun 1, 2024
310513b
fix model tests
sibre28 Jun 1, 2024
967430a
Merge remote-tracking branch 'origin/710-easily-create-a-baseline-mod…
sibre28 Jun 1, 2024
b96fbf3
fix codecov
sibre28 Jun 2, 2024
8a15d45
style: apply automated linter fixes
megalinter-bot Jun 2, 2024
e659d24
Merge branch 'main' into 710-easily-create-a-baseline-model
sibre28 Jun 2, 2024
af06e73
rename DatasetMissesTargetError to TargetDataMismatchError
sibre28 Jun 3, 2024
faab0a8
style: apply automated linter fixes
megalinter-bot Jun 3, 2024
abab90b
style: apply automated linter fixes
megalinter-bot Jun 3, 2024
a95a179
Merge branch 'main' into 710-easily-create-a-baseline-model
sibre28 Jun 4, 2024
b4b120e
Merge branch 'main' into 710-easily-create-a-baseline-model
sibre28 Jun 11, 2024
e7579a0
Merge branch 'main' into 710-easily-create-a-baseline-model
sibre28 Jun 18, 2024
d5150c6
Merge branch 'main' into 710-easily-create-a-baseline-model
sibre28 Jun 25, 2024
b8229f6
remove todos
sibre28 Jun 25, 2024
aa0abf2
Merge remote-tracking branch 'origin/710-easily-create-a-baseline-mod…
sibre28 Jun 25, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/safeds/exceptions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
ModelNotFittedError,
PlainTableError,
PredictionError,
TargetDataMismatchError,
)


Expand Down Expand Up @@ -69,6 +70,7 @@ class OutOfBoundsError(SafeDsError):
# ML exceptions
"DatasetMissesDataError",
"DatasetMissesFeaturesError",
"TargetDataMismatchError",
"FeatureDataMismatchError",
"InvalidFitDataError",
"InputSizeError",
Expand Down
26 changes: 23 additions & 3 deletions src/safeds/exceptions/_ml.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,26 @@ def __init__(self, missing_feature_names: list[str]):
super().__init__(f"Dataset misses the feature columns '{missing_feature_names}'.")


class TargetDataMismatchError(ValueError):
    """
    Raised when the target column of a test dataset does not match the target column of the training dataset.

    Currently only used in the Baseline Models.

    Parameters
    ----------
    actual_target_name:
        The name of the target column of the provided dataset. It is reported as the "provided target column".
    missing_target_name:
        The name of the target column that was expected. It is reported as the "target column of the training set".
    """

    def __init__(self, actual_target_name: str, missing_target_name: str):
        # Assemble the message first so the call to the base class stays short.
        message = (
            f"The provided target column '{actual_target_name}' does not match the target column of the training set"
            f" '{missing_target_name}'."
        )
        super().__init__(message)


class DatasetMissesDataError(ValueError):
"""Raised when a dataset contains no rows."""

Expand Down Expand Up @@ -72,16 +92,16 @@ def __init__(self, reason: str):


class FeatureDataMismatchError(Exception):
    """Raised when the columns of the table passed to the predict or fit method do not match with the specified features of the model."""

    def __init__(self) -> None:
        # Exactly one message is passed on, so `str(error)` yields the plain sentence and `args` has one element.
        super().__init__(
            "The features in the given table do not match with the specified feature columns names of the model.",
        )


class InputSizeError(Exception):
"""Raised when the amount of features being passed to a network does not match with its input size."""
"""Raised when the amount of features being passed to a model does not match with its input size."""

def __init__(self, data_size: int | ModelImageSize, input_layer_size: int | ModelImageSize | None) -> None:
# TODO: remove input_layer_size type None again
Expand Down
3 changes: 3 additions & 0 deletions src/safeds/ml/classical/classification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

if TYPE_CHECKING:
from ._ada_boost_classifier import AdaBoostClassifier
from ._baseline_classifier import BaselineClassifier
from ._classifier import Classifier
from ._decision_tree_classifier import DecisionTreeClassifier
from ._gradient_boosting_classifier import GradientBoostingClassifier
Expand All @@ -18,6 +19,7 @@
__name__,
{
"AdaBoostClassifier": "._ada_boost_classifier:AdaBoostClassifier",
"BaselineClassifier": "._baseline_classifier:BaselineClassifier",
"Classifier": "._classifier:Classifier",
"DecisionTreeClassifier": "._decision_tree_classifier:DecisionTreeClassifier",
"GradientBoostingClassifier": "._gradient_boosting_classifier:GradientBoostingClassifier",
Expand All @@ -30,6 +32,7 @@

__all__ = [
"AdaBoostClassifier",
"BaselineClassifier",
"Classifier",
"DecisionTreeClassifier",
"GradientBoostingClassifier",
Expand Down
188 changes: 188 additions & 0 deletions src/safeds/ml/classical/classification/_baseline_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
import copy
from concurrent.futures import ALL_COMPLETED, wait
from typing import Self

from safeds._validation._check_columns_are_numeric import _check_columns_are_numeric
from safeds.data.labeled.containers import TabularDataset
from safeds.exceptions import (
DatasetMissesDataError,
FeatureDataMismatchError,
ModelNotFittedError,
TargetDataMismatchError,
)
from safeds.ml.classical.classification import (
AdaBoostClassifier,
Classifier,
DecisionTreeClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
SupportVectorClassifier,
)


def _fit_single_model(model: Classifier, train_data: TabularDataset) -> Classifier:
    """Call `model.fit(train_data)` and return its result; submitted to the executor by `BaselineClassifier.fit`."""
    fitted_model = model.fit(train_data)  # pragma: no cover
    return fitted_model  # pragma: no cover


def _predict_single_model(model: Classifier, test_data: TabularDataset) -> TabularDataset:
    """Call `model.predict(test_data)` and return its result; submitted to the executor by `BaselineClassifier.predict`."""
    prediction = model.predict(test_data)  # pragma: no cover
    return prediction  # pragma: no cover


class BaselineClassifier:
    """
    Baseline Classifier.

    Get a baseline by fitting data on multiple different models and comparing the best metrics.

    Parameters
    ----------
    extended_search:
        If set to true, an extended set of models will be used to fit the classifier. This might result in
        significantly higher runtime.
    """

    def __init__(self, extended_search: bool = False):
        self._is_fitted = False
        self._list_of_model_types = [
            AdaBoostClassifier(),
            DecisionTreeClassifier(),
            SupportVectorClassifier(),
            RandomForestClassifier(),
        ]
        if extended_search:
            self._list_of_model_types.append(GradientBoostingClassifier())  # pragma: no cover

        self._fitted_models: list[Classifier] = []
        self._feature_names: list[str] | None = None
        # Placeholder until `fit` stores the name of the real target column.
        self._target_name: str = "none"

    def fit(self, train_data: TabularDataset) -> Self:
        """
        Train the Classifier with given training data.

        The original model is not modified.

        Parameters
        ----------
        train_data:
            The data the classifier should be trained on.

        Returns
        -------
        trained_classifier:
            The trained Classifier

        Raises
        ------
        DatasetMissesDataError
            If the given train_data contains no data.
        ColumnTypeError
            If one or more columns contain non-numeric values.
        """
        from concurrent.futures import ProcessPoolExecutor

        # Validate Data
        train_data_as_table = train_data.to_table()
        if train_data_as_table.row_count == 0:
            raise DatasetMissesDataError
        _check_columns_are_numeric(train_data_as_table, train_data.features.add_columns(train_data.target).column_names)

        copied_model = copy.deepcopy(self)

        # Leaving the `with` block shuts the executor down, so no explicit `shutdown()` call is needed.
        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
            futures = [executor.submit(_fit_single_model, model, train_data) for model in self._list_of_model_types]
            wait(futures, return_when=ALL_COMPLETED)
            # Replace (not extend) the list: the deep copy of an already fitted instance still carries the models
            # of the previous fit, which must not take part in later predictions. Reading the futures in
            # submission order keeps the fitted models aligned with `_list_of_model_types`.
            copied_model._fitted_models = [future.result() for future in futures]

        copied_model._is_fitted = True
        copied_model._feature_names = train_data.features.column_names
        copied_model._target_name = train_data.target.name
        return copied_model

    def predict(self, test_data: TabularDataset) -> dict[str, float]:
        """
        Make a prediction for the given test data and calculate the best metrics.

        The original Model is not modified.

        Parameters
        ----------
        test_data:
            The data the Classifier should predict.

        Returns
        -------
        best_metrics:
            A dictionary with the best metrics that were achieved. The keys are "accuracy", "f1score", "precision",
            and "recall"; each value is the maximum over all fitted models and is at least 0.0.

        Raises
        ------
        ModelNotFittedError
            If the model has not been fitted yet
        FeatureDataMismatchError
            If the features of the test data do not match with the features of the trained Classifier.
        DatasetMissesDataError
            If the given test_data contains no data.
        TargetDataMismatchError
            If the target column of the test data does not match the target column of the training data.
        ColumnTypeError
            If one or more columns contain non-numeric values.
        """
        from concurrent.futures import ProcessPoolExecutor

        from safeds.ml.metrics import ClassificationMetrics

        if not self._is_fitted:
            raise ModelNotFittedError

        # Validate data
        if self._feature_names != test_data.features.column_names:
            raise FeatureDataMismatchError
        if self._target_name != test_data.target.name:
            raise TargetDataMismatchError(
                actual_target_name=test_data.target.name,
                missing_target_name=self._target_name,
            )
        test_data_as_table = test_data.to_table()
        if test_data_as_table.row_count == 0:
            raise DatasetMissesDataError
        _check_columns_are_numeric(test_data_as_table, test_data.features.add_columns(test_data.target).column_names)

        # Leaving the `with` block shuts the executor down, so no explicit `shutdown()` call is needed.
        with ProcessPoolExecutor(max_workers=len(self._list_of_model_types)) as executor:
            futures = [executor.submit(_predict_single_model, model, test_data) for model in self._fitted_models]
            wait(futures, return_when=ALL_COMPLETED)
            # Read the futures in submission order so the results line up with `_fitted_models`.
            results = [future.result() for future in futures]

        # The first target value of the test data serves as the positive class for f1 score, precision and recall.
        # It is the same for every model, so it is looked up once (the data is known to be non-empty here).
        positive_class = test_data.target.get_value(0)

        max_metrics = {"accuracy": 0.0, "f1score": 0.0, "precision": 0.0, "recall": 0.0}
        for result in results:
            scores = {
                "accuracy": ClassificationMetrics.accuracy(result, test_data),
                "f1score": ClassificationMetrics.f1_score(result, test_data, positive_class),
                "precision": ClassificationMetrics.precision(result, test_data, positive_class),
                "recall": ClassificationMetrics.recall(result, test_data, positive_class),
            }
            # Keep the best value seen so far for each metric independently.
            for metric_name, score in scores.items():
                if max_metrics[metric_name] < score:
                    max_metrics[metric_name] = score

        return max_metrics

    @property
    def is_fitted(self) -> bool:
        """Whether the model is fitted."""
        return self._is_fitted
3 changes: 3 additions & 0 deletions src/safeds/ml/classical/regression/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
if TYPE_CHECKING:
from ._ada_boost_regressor import AdaBoostRegressor
from ._arima import ArimaModelRegressor
from ._baseline_regressor import BaselineRegressor
from ._decision_tree_regressor import DecisionTreeRegressor
from ._elastic_net_regressor import ElasticNetRegressor
from ._gradient_boosting_regressor import GradientBoostingRegressor
Expand All @@ -23,6 +24,7 @@
{
"AdaBoostRegressor": "._ada_boost_regressor:AdaBoostRegressor",
"ArimaModelRegressor": "._arima:ArimaModelRegressor",
"BaselineRegressor": "._baseline_regressor:BaselineRegressor",
"DecisionTreeRegressor": "._decision_tree_regressor:DecisionTreeRegressor",
"ElasticNetRegressor": "._elastic_net_regressor:ElasticNetRegressor",
"GradientBoostingRegressor": "._gradient_boosting_regressor:GradientBoostingRegressor",
Expand All @@ -39,6 +41,7 @@
__all__ = [
"AdaBoostRegressor",
"ArimaModelRegressor",
"BaselineRegressor",
"DecisionTreeRegressor",
"ElasticNetRegressor",
"GradientBoostingRegressor",
Expand Down
Loading