Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add linear regression #15

Merged
merged 23 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/sonarcloud.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,27 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install dependencies
# it should be in editable mode for coverage to work
run: |
python -m pip install --upgrade pip
python -m pip install -e .[dev]
- name: Run unit tests and store coverage
run: |
coverage run
coverage report -m
coverage xml -o coverage.xml
- name: Make coverage paths relative
run: sed -i "s+$PWD/++g" coverage.xml
- name: SonarQube Scan
uses: SonarSource/sonarqube-scan-action@v4
with:
args: >
-Dsonar.verbose=true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
85 changes: 70 additions & 15 deletions docs/notebooks/compare_linear_models.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,13 @@
" 'spd_rot_act',\n",
" 'wind_speed'],\n",
" 'save_grid_scores': True,\n",
" 'save_best_model': True}"
" 'save_best_model': True,\n",
" 'n_jobs': 2,\n",
" 'use_gpu': False,\n",
" 'train_size': 0.7,\n",
" 'models': ['en', 'lar', 'llar', 'lasso', 'lr', 'ridge', 'omp', 'ransac'],\n",
" 'metrics_sort': 'R2',\n",
" 'system_log': './logs.log'}"
]
},
"execution_count": 6,
Expand All @@ -331,50 +337,99 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "b48f2669-1143-4db0-bfeb-8e7fe51a594a",
"id": "03987d96-eb24-4d7b-972b-7d5b1688ed67",
"metadata": {},
"outputs": [],
"source": [
"# change some settings\n",
"my_pipeline.config[\"ml_setup\"][\"save_best_model\"] = False"
"import mlflow\n",
"import mlflow.sklearn\n",
"from sklearn.model_selection import train_test_split\n",
"from fowt_ml.linear_models import LinearModels"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ac7a38fe-8832-4a06-8822-70bbe03ae335",
"execution_count": 11,
"id": "19caa398-431d-4ab0-827d-26118966c95a",
"metadata": {},
"outputs": [],
"source": [
"my_pipeline.setup(df)"
"def train_and_log_model(model_name):\n",
" with mlflow.start_run():\n",
" process = LinearModels(model_name)\n",
" rmse = process.calculate_metric(X_train, X_test, y_train, y_test, \"root_mean_squared_error\") \n",
" model = process.model\n",
" \n",
" # Log model parameters and metrics\n",
" mlflow.log_param(\"model_name\", model_name)\n",
" mlflow.log_param(\"n_estimators\", model.n_estimators if hasattr(model, 'n_estimators') else None)\n",
" mlflow.log_param(\"max_depth\", model.max_depth if hasattr(model, 'max_depth') else None)\n",
" mlflow.log_metric(\"rmse\", rmse)\n",
" \n",
" # Log the model itself\n",
" mlflow.sklearn.log_model(model, model_name)\n",
" \n",
" return rmse"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2ea35699-f17a-4a8f-be92-3a2ed29b5bed",
"execution_count": 12,
"id": "bbf83157-0343-4a5a-af66-2913396811f3",
"metadata": {},
"outputs": [],
"source": [
"# Dictionary to store model performances\n",
"model_performances = {}\n",
"\n",
"predictors_labels = my_pipeline.config[\"ml_setup\"][\"predictors\"]\n",
"target_labels = my_pipeline.config[\"ml_setup\"][\"target\"]\n",
"\n",
"X_data = df[predictors_labels]\n",
"Y_data = df[target_labels]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.75, shuffle=False, random_state=123)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "e19dd7bf-0253-44de-a280-429b03e77625",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[31m2025/01/22 14:19:20 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n",
"\u001b[31m2025/01/22 14:19:23 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n",
"\u001b[31m2025/01/22 14:19:25 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n",
"\u001b[31m2025/01/22 14:19:27 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ElasticNet(random_state=123)\n",
"CPU times: user 4.57 s, sys: 1.93 s, total: 6.5 s\n",
"Wall time: 23.1 s\n"
"{'LeastAngleRegression': 3.5914625711937265, 'LinearRegression': 3.5914560337640307, 'LassoRegression': 3.733306142704239, 'RidgeRegression': 3.5914586161568707}\n",
"CPU times: user 6.68 s, sys: 203 ms, total: 6.88 s\n",
"Wall time: 10.7 s\n"
]
}
],
"source": [
"%%time\n",
"best_model = my_pipeline.compare_models()\n",
"print(best_model)"
"# Train, log models, and compare performance\n",
"for name in [\"LeastAngleRegression\",\"LinearRegression\", \"LassoRegression\", \"RidgeRegression\"]:\n",
" rmse = train_and_log_model(name)\n",
" model_performances[name] = rmse\n",
"print(model_performances)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f56cf79a-e94e-4285-a673-3627fd106f01",
"id": "66edce0f-32a0-4583-b8b1-c77969e2db9e",
"metadata": {},
"outputs": [],
"source": []
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ classifiers = [
]
dependencies = [
"h5py",
"mlflow",
"numpy",
"pandas",
"pyyaml",
Expand Down
6 changes: 2 additions & 4 deletions sonar-project.properties
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
sonar.organization=hybridlabs-nl
sonar.projectKey=hybridlabs-nl_FOWT-ML
sonar.host.url=https://sonarcloud.io
sonar.sources=src/fowt_ml/
sonar.tests=tests/
sonar.sources=src/fowt_ml
sonar.tests=tests
sonar.links.homepage=https://github.com/hybridlabs-nl/FOWT-ML
[email protected]:hybridlabs-nl/FOWT-ML
sonar.links.issue=https://github.com/hybridlabs-nl/FOWT-ML/issues
sonar.links.ci=https://github.com/hybridlabs-nl/FOWT-ML/actions
sonar.python.coverage.reportPaths=coverage.xml
sonar.python.xunit.reportPath=xunit-result.xml
sonar.python.pylint.reportPaths=pylint-report.txt
100 changes: 100 additions & 0 deletions src/fowt_ml/linear_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Module to handle linear models."""

from collections.abc import Callable
from dataclasses import dataclass

import sklearn.linear_model as lm
import sklearn.metrics as sm
from numpy.typing import ArrayLike


@dataclass
class Estimator:
    """Named wrapper around an estimator object.

    Attributes:
        name: human-readable identifier of the estimator.
        func: the estimator object (e.g. a fitted-or-not sklearn estimator
            exposing ``fit``/``predict``).
        reference: library the estimator comes from. Defaults to "sklearn".
    """

    name: str
    # ``Callable`` (not the builtin ``callable`` function) is the correct
    # annotation for "something callable / an estimator object".
    func: Callable
    reference: str = "sklearn"


@dataclass
class Metric:
    """Named wrapper around a metric function.

    Attributes:
        name: human-readable identifier of the metric.
        func: callable computing the metric, with the sklearn signature
            ``func(y_true, y_pred) -> float``.
        reference: library the metric comes from. Defaults to "sklearn".
    """

    name: str
    # ``Callable`` (not the builtin ``callable`` function) is the correct
    # annotation for a metric function.
    func: Callable
    reference: str = "sklearn"


class LinearModels:
    """Class to handle linear models and metrics for comparison.

    Wraps a linear estimator (by name or as an ``Estimator`` instance) and
    computes evaluation metrics on a train/test split.
    """

    # Canonical estimators selectable by name. These class-level objects are
    # treated as read-only templates: ``__init__`` instantiates a fresh copy,
    # so instances of this class never share (and mutate) the same estimator.
    ESTIMATOR_NAMES = {
        "LinearRegression": lm.LinearRegression(),
        "RidgeRegression": lm.Ridge(),
        "LassoRegression": lm.Lasso(),
        "ElasticNetRegression": lm.ElasticNet(),
        "LeastAngleRegression": lm.Lars(),
    }
    # Metrics selectable by name, all with the sklearn (y_true, y_pred) signature.
    METRICS_NAMES = {
        "root_mean_squared_error": sm.root_mean_squared_error,
        "mean_squared_error": sm.mean_squared_error,
        "r2_score": sm.r2_score,
        "mean_absolute_error": sm.mean_absolute_error,
    }

    def __init__(self, estimator: str | Estimator, kwargs: dict | None = None) -> None:
        """Initialize the class with the estimator.

        Args:
            estimator: either one of the keys of ``ESTIMATOR_NAMES`` or an
                ``Estimator`` instance.
            kwargs: optional hyperparameters forwarded to the estimator via
                ``set_params`` (validated by sklearn).

        Raises:
            ValueError: if ``estimator`` is an unsupported name or neither a
                string nor an ``Estimator`` instance.
        """
        if isinstance(estimator, str):
            if estimator not in self.ESTIMATOR_NAMES:
                msg = (
                    f"estimator {estimator} not supported. "
                    f"Choose one of {list(self.ESTIMATOR_NAMES.keys())} "
                    "or pass an Estimator instance."
                )
                raise ValueError(msg)
            # Build a fresh estimator of the requested type instead of reusing
            # the shared class-level instance, which would leak hyperparameter
            # changes and fitted state between LinearModels instances.
            estimator_func = type(self.ESTIMATOR_NAMES[estimator])()
            self.estimator = Estimator(estimator, estimator_func)
        elif isinstance(estimator, Estimator):
            self.estimator = estimator
            # TODO: validate if model function is a callable and valid
        else:
            raise ValueError("estimator must be a string or an Estimator instance.")

        # Apply hyperparameters through sklearn's set_params so unknown
        # parameter names raise instead of silently creating new attributes.
        if kwargs:
            self.estimator.func.set_params(**kwargs)

    def calculate_metric(
        self,
        x_train: ArrayLike,
        x_test: ArrayLike,
        y_train: ArrayLike,
        y_test: ArrayLike,
        metric: str | Metric,
    ) -> float:
        """Calculate the metric for the model using test data.

        First it fits the model with the training data, then predicts the test
        data.

        Args:
            x_train (ArrayLike): training data for features
            x_test (ArrayLike): test data for features
            y_train (ArrayLike): training data for targets
            y_test (ArrayLike): test data for targets
            metric (str | Metric): the metric to calculate

        Returns:
            float: the metric value

        Raises:
            ValueError: if ``metric`` is an unsupported name or neither a
                string nor a ``Metric`` instance.
        """
        if isinstance(metric, str):
            if metric not in self.METRICS_NAMES:
                msg = (
                    f"metric {metric} not supported. "
                    f"Choose one of {list(self.METRICS_NAMES.keys())} "
                    "or pass a Metric instance."
                )
                raise ValueError(msg)
            self.metric = Metric(metric, self.METRICS_NAMES[metric])
        elif isinstance(metric, Metric):
            self.metric = metric
        else:
            raise ValueError("metric must be a string or a Metric instance.")

        # TODO: check other arguments of fit, predict and metric functions
        # sklearn's fit returns the estimator itself; keep both the fitted
        # model and the test predictions for later inspection.
        self.model = self.estimator.func.fit(x_train, y_train)
        self.y_pred = self.model.predict(x_test)
        return self.metric.func(y_test, self.y_pred)
Loading
Loading