Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

144 add logistic regression #222

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions docs/prediction/logistic_regression.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Logistic regression

::: eis_toolkit.prediction.logistic_regression
92 changes: 92 additions & 0 deletions eis_toolkit/prediction/logistic_regression.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
from typing import Literal, Optional, Tuple, Union

import numpy as np
import pandas as pd
from beartype import beartype
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

from eis_toolkit import exceptions


@beartype
def logistic_regression_train(
    X: Union[np.ndarray, pd.DataFrame],
    y: Union[np.ndarray, pd.Series],
    test_size: float = 0.25,
    penalty: Literal["l1", "l2", "elasticnet", None] = "l2",
    max_iter: int = 100,
    random_state: Optional[int] = None,
    solver: Literal["lbfgs", "liblinear", "newton-cg", "newton-cholesky", "sag", "saga"] = "lbfgs",
) -> Tuple[LogisticRegression, dict]:
    """
    Train a Logistic Regression classifier model using Sklearn.

    Splits the data into training and test sets, fits the model on the training set
    and evaluates model performance on the test set.

    The choice of the algorithm depends on the penalty chosen. Supported penalties by solver:
        'lbfgs'           -   ['l2', None]
        'liblinear'       -   ['l1', 'l2']
        'newton-cg'       -   ['l2', None]
        'newton-cholesky' -   ['l2', None]
        'sag'             -   ['l2', None]
        'saga'            -   ['elasticnet', 'l1', 'l2', None]

    Args:
        X: Training data.
        y: Target labels.
        test_size: Fraction of the dataset to be used as test data (rest is used for training).
            Must be strictly between 0 and 1. Defaults to 0.25.
        penalty: Specifies the norm of the penalty. Defaults to 'l2'.
        max_iter: Maximum number of iterations taken for the solvers to converge. Defaults to 100.
        random_state: Seed for random number generation. Defaults to None.
        solver: Algorithm to use in the optimization problem. Defaults to 'lbfgs'.

    Returns:
        The trained Logistic Regression classifier and details of test set performance
        (the dictionary form of Sklearn's classification report).

    Raises:
        NonMatchingParameterLengthsException: If length of X and y don't match.
        InvalidParameterValueException: test_size is not strictly between 0 and 1
            or max_iter is less than one.
    """
    # NOTE(review): Sklearn's 'elasticnet' penalty presumably also needs an l1_ratio,
    # which this function does not expose - confirm against the pinned Sklearn version.

    # Number of samples: row count for DataFrames, first axis length for arrays.
    x_size = X.index.size if isinstance(X, pd.DataFrame) else X.shape[0]
    if x_size != y.size:
        raise exceptions.NonMatchingParameterLengthsException(
            f"X and y must have the same length: {x_size} != {y.size}."
        )

    # The bounds are exclusive: a fraction of 0 or 1 would leave either the test set
    # or the training set empty, which train_test_split refuses for float sizes.
    if not (0 < test_size < 1):
        raise exceptions.InvalidParameterValueException(
            "Input value for test_size must be between 0 and 1 (exclusive)."
        )

    if max_iter < 1:
        raise exceptions.InvalidParameterValueException("Input value for max_iter must be > 0.")

    # Splitting data into training and testing
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)

    model = LogisticRegression(penalty=penalty, max_iter=max_iter, random_state=random_state, solver=solver)

    model.fit(X_train, y_train)

    # Predictions for test data
    y_pred = model.predict(X_test)

    # Performance metrics
    report = classification_report(y_test, y_pred, output_dict=True)

    return model, report


@beartype
def logistic_regression_predict(model: LogisticRegression, X: Union[np.ndarray, pd.DataFrame]) -> np.ndarray:
    """
    Predict labels with an already trained Logistic Regression model.

    Args:
        model: Trained Logistic Regression classifier.
        X: Features for which predictions are to be made.

    Returns:
        Predicted labels.
    """
    # Thin wrapper: the model's own predict method does all the work.
    return model.predict(X)
45 changes: 45 additions & 0 deletions tests/prediction/logistic_regression_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import numpy as np
import pytest
from beartype.roar import BeartypeCallHintParamViolation
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

from eis_toolkit import exceptions
from eis_toolkit.prediction.logistic_regression import logistic_regression_predict, logistic_regression_train

# Iris features (X) and class labels (y), loaded once and shared by every test in this module.
X, y = load_iris(return_X_y=True)


def test_logistic_regression():
    """Test that training and prediction with Logistic Regression work as expected."""
    trained_model, test_report = logistic_regression_train(X, y, random_state=42)
    all_predictions = logistic_regression_predict(trained_model, X)

    assert isinstance(trained_model, LogisticRegression)
    np.testing.assert_equal(len(all_predictions), len(y))

    # The report describes the held-out test split; with this seed every class
    # is expected to reach a perfect score on each metric.
    for class_label in ("0", "1", "2"):
        class_scores = test_report[class_label]
        for score_name in ("precision", "recall", "f1-score"):
            np.testing.assert_equal(class_scores[score_name], 1.0)


def test_logistic_regression_wrong_input_shapes():
    """Test that X and y of different lengths raise the correct exception."""
    # Drop the last label so that y has one sample fewer than X.
    shortened_labels = y[:-1]
    expected_error = exceptions.NonMatchingParameterLengthsException
    with pytest.raises(expected_error):
        logistic_regression_train(X, shortened_labels, random_state=42)


def test_invalid_penalty():
    """Test that a penalty outside the allowed literals is rejected by the type checker."""
    unsupported_penalty = "invalid_penalty"
    # The value is rejected by the beartype decorator before the function body runs.
    with pytest.raises(BeartypeCallHintParamViolation):
        logistic_regression_train(X, y, penalty=unsupported_penalty)


def test_invalid_max_iter():
    """Test that a maximum iteration count below one raises the correct exception."""
    too_few_iterations = 0
    expected_error = exceptions.InvalidParameterValueException
    with pytest.raises(expected_error):
        logistic_regression_train(X, y, max_iter=too_few_iterations)
Loading