Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add linear regression #15

Merged
merged 23 commits into from
Jan 31, 2025
Merged
Show file tree
Hide file tree
Changes from 21 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions .github/workflows/sonarcloud.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,27 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0 # Shallow clones should be disabled for a better relevancy of analysis
- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install dependencies
# it should be in editable mode for coverage to work
run: |
python -m pip install --upgrade pip
python -m pip install -e .[dev]
- name: Run unit tests and store coverage
run: |
coverage run
coverage report -m
coverage xml -o coverage.xml
- name: Make coverage paths relative
run: sed -i "s+$PWD/++g" coverage.xml
- name: SonarQube Scan
uses: SonarSource/sonarqube-scan-action@v4
with:
args: >
-Dsonar.verbose=true
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
SONAR_TOKEN: ${{ secrets.SONAR_TOKEN }}
85 changes: 70 additions & 15 deletions docs/notebooks/compare_linear_models.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,13 @@
" 'spd_rot_act',\n",
" 'wind_speed'],\n",
" 'save_grid_scores': True,\n",
" 'save_best_model': True}"
" 'save_best_model': True,\n",
" 'n_jobs': 2,\n",
" 'use_gpu': False,\n",
" 'train_size': 0.7,\n",
" 'models': ['en', 'lar', 'llar', 'lasso', 'lr', 'ridge', 'omp', 'ransac'],\n",
" 'metrics_sort': 'R2',\n",
" 'system_log': './logs.log'}"
]
},
"execution_count": 6,
Expand All @@ -331,50 +337,99 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "b48f2669-1143-4db0-bfeb-8e7fe51a594a",
"id": "03987d96-eb24-4d7b-972b-7d5b1688ed67",
"metadata": {},
"outputs": [],
"source": [
"# change some settings\n",
"my_pipeline.config[\"ml_setup\"][\"save_best_model\"] = False"
"import mlflow\n",
"import mlflow.sklearn\n",
"from sklearn.model_selection import train_test_split\n",
"from fowt_ml.linear_models import LinearModels"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "ac7a38fe-8832-4a06-8822-70bbe03ae335",
"execution_count": 11,
"id": "19caa398-431d-4ab0-827d-26118966c95a",
"metadata": {},
"outputs": [],
"source": [
"my_pipeline.setup(df)"
"def train_and_log_model(model_name):\n",
" with mlflow.start_run():\n",
" process = LinearModels(model_name)\n",
" rmse = process.calculate_metric(X_train, X_test, y_train, y_test, \"root_mean_squared_error\") \n",
" model = process.model\n",
" \n",
" # Log model parameters and metrics\n",
" mlflow.log_param(\"model_name\", model_name)\n",
" mlflow.log_param(\"n_estimators\", model.n_estimators if hasattr(model, 'n_estimators') else None)\n",
" mlflow.log_param(\"max_depth\", model.max_depth if hasattr(model, 'max_depth') else None)\n",
" mlflow.log_metric(\"rmse\", rmse)\n",
" \n",
" # Log the model itself\n",
" mlflow.sklearn.log_model(model, model_name)\n",
" \n",
" return rmse"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "2ea35699-f17a-4a8f-be92-3a2ed29b5bed",
"execution_count": 12,
"id": "bbf83157-0343-4a5a-af66-2913396811f3",
"metadata": {},
"outputs": [],
"source": [
"# Dictionary to store model performances\n",
"model_performances = {}\n",
"\n",
"predictors_labels = my_pipeline.config[\"ml_setup\"][\"predictors\"]\n",
"target_labels = my_pipeline.config[\"ml_setup\"][\"target\"]\n",
"\n",
"X_data = df[predictors_labels]\n",
"Y_data = df[target_labels]\n",
"\n",
"X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=0.75, shuffle=False, random_state=123)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "e19dd7bf-0253-44de-a280-429b03e77625",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"\u001b[31m2025/01/22 14:19:20 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n",
"\u001b[31m2025/01/22 14:19:23 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n",
"\u001b[31m2025/01/22 14:19:25 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n",
"\u001b[31m2025/01/22 14:19:27 WARNING mlflow.models.model: Model logged without a signature and input example. Please set `input_example` parameter when logging the model to auto infer the model signature.\u001b[0m\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ElasticNet(random_state=123)\n",
"CPU times: user 4.57 s, sys: 1.93 s, total: 6.5 s\n",
"Wall time: 23.1 s\n"
"{'LeastAngleRegression': 3.5914625711937265, 'LinearRegression': 3.5914560337640307, 'LassoRegression': 3.733306142704239, 'RidgeRegression': 3.5914586161568707}\n",
"CPU times: user 6.68 s, sys: 203 ms, total: 6.88 s\n",
"Wall time: 10.7 s\n"
]
}
],
"source": [
"%%time\n",
"best_model = my_pipeline.compare_models()\n",
"print(best_model)"
"# Train, log models, and compare performance\n",
"for name in [\"LeastAngleRegression\",\"LinearRegression\", \"LassoRegression\", \"RidgeRegression\"]:\n",
" rmse = train_and_log_model(name)\n",
" model_performances[name] = rmse\n",
"print(model_performances)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f56cf79a-e94e-4285-a673-3627fd106f01",
"id": "66edce0f-32a0-4583-b8b1-c77969e2db9e",
"metadata": {},
"outputs": [],
"source": []
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ classifiers = [
]
dependencies = [
"h5py",
"mlflow",
"numpy",
"pandas",
"pyyaml",
Expand Down
6 changes: 2 additions & 4 deletions sonar-project.properties
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
sonar.organization=hybridlabs-nl
sonar.projectKey=hybridlabs-nl_FOWT-ML
sonar.host.url=https://sonarcloud.io
sonar.sources=src/fowt_ml/
sonar.tests=tests/
sonar.sources=src/fowt_ml
sonar.tests=tests
sonar.links.homepage=https://github.com/hybridlabs-nl/FOWT-ML
[email protected]:hybridlabs-nl/FOWT-ML
sonar.links.issue=https://github.com/hybridlabs-nl/FOWT-ML/issues
sonar.links.ci=https://github.com/hybridlabs-nl/FOWT-ML/actions
sonar.python.coverage.reportPaths=coverage.xml
sonar.python.xunit.reportPath=xunit-result.xml
sonar.python.pylint.reportPaths=pylint-report.txt
100 changes: 100 additions & 0 deletions src/fowt_ml/linear_models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
"""Module to handle linear models."""

from collections.abc import Callable
from dataclasses import dataclass

import sklearn.linear_model as lm
import sklearn.metrics as sm
from numpy.typing import ArrayLike


@dataclass
class Estimator:
    """Named wrapper around an estimator object.

    Attributes:
        name: human-readable identifier of the estimator.
        func: the estimator object (e.g. a fitted-or-not sklearn estimator
            exposing ``fit``/``predict``).
        reference: library the estimator comes from. Defaults to "sklearn".
    """

    name: str
    # ``Callable`` (not the builtin ``callable`` function) is the correct
    # annotation for "something callable / an estimator object".
    func: Callable
    reference: str = "sklearn"


@dataclass
class Metric:
    """Named wrapper around a metric function.

    Attributes:
        name: human-readable identifier of the metric.
        func: callable computing the metric, with the sklearn signature
            ``func(y_true, y_pred) -> float``.
        reference: library the metric comes from. Defaults to "sklearn".
    """

    name: str
    # ``Callable`` (not the builtin ``callable`` function) is the correct
    # annotation for a metric function.
    func: Callable
    reference: str = "sklearn"


class LinearModels:
    """Class to handle linear models and metrics for comparison.

    Wraps a linear estimator (by name or as an ``Estimator`` instance) and
    computes evaluation metrics on a train/test split.
    """

    # Canonical estimators selectable by name. These class-level objects are
    # treated as read-only templates: ``__init__`` instantiates a fresh copy,
    # so instances of this class never share (and mutate) the same estimator.
    ESTIMATOR_NAMES = {
        "LinearRegression": lm.LinearRegression(),
        "RidgeRegression": lm.Ridge(),
        "LassoRegression": lm.Lasso(),
        "ElasticNetRegression": lm.ElasticNet(),
        "LeastAngleRegression": lm.Lars(),
    }
    # Metrics selectable by name, all with the sklearn (y_true, y_pred) signature.
    METRICS_NAMES = {
        "root_mean_squared_error": sm.root_mean_squared_error,
        "mean_squared_error": sm.mean_squared_error,
        "r2_score": sm.r2_score,
        "mean_absolute_error": sm.mean_absolute_error,
    }

    def __init__(self, estimator: str | Estimator, kwargs: dict | None = None) -> None:
        """Initialize the class with the estimator.

        Args:
            estimator: either one of the keys of ``ESTIMATOR_NAMES`` or an
                ``Estimator`` instance.
            kwargs: optional hyperparameters forwarded to the estimator via
                ``set_params`` (validated by sklearn).

        Raises:
            ValueError: if ``estimator`` is an unsupported name or neither a
                string nor an ``Estimator`` instance.
        """
        if isinstance(estimator, str):
            if estimator not in self.ESTIMATOR_NAMES:
                msg = (
                    f"estimator {estimator} not supported. "
                    f"Choose one of {list(self.ESTIMATOR_NAMES.keys())} "
                    "or pass an Estimator instance."
                )
                raise ValueError(msg)
            # Build a fresh estimator of the requested type instead of reusing
            # the shared class-level instance, which would leak hyperparameter
            # changes and fitted state between LinearModels instances.
            estimator_func = type(self.ESTIMATOR_NAMES[estimator])()
            self.estimator = Estimator(estimator, estimator_func)
        elif isinstance(estimator, Estimator):
            self.estimator = estimator
            # TODO: validate if model function is a callable and valid
        else:
            raise ValueError("estimator must be a string or an Estimator instance.")

        # Apply hyperparameters through sklearn's set_params so unknown
        # parameter names raise instead of silently creating new attributes.
        if kwargs:
            self.estimator.func.set_params(**kwargs)

    def calculate_metric(
        self,
        x_train: ArrayLike,
        x_test: ArrayLike,
        y_train: ArrayLike,
        y_test: ArrayLike,
        metric: str | Metric,
    ) -> float:
        """Calculate the metric for the model using test data.

        First it fits the model with the training data, then predicts the test
        data.

        Args:
            x_train (ArrayLike): training data for features
            x_test (ArrayLike): test data for features
            y_train (ArrayLike): training data for targets
            y_test (ArrayLike): test data for targets
            metric (str | Metric): the metric to calculate

        Returns:
            float: the metric value

        Raises:
            ValueError: if ``metric`` is an unsupported name or neither a
                string nor a ``Metric`` instance.
        """
        if isinstance(metric, str):
            if metric not in self.METRICS_NAMES:
                msg = (
                    f"metric {metric} not supported. "
                    f"Choose one of {list(self.METRICS_NAMES.keys())} "
                    "or pass a Metric instance."
                )
                raise ValueError(msg)
            self.metric = Metric(metric, self.METRICS_NAMES[metric])
        elif isinstance(metric, Metric):
            self.metric = metric
        else:
            raise ValueError("metric must be a string or a Metric instance.")

        # TODO: check other arguments of fit, predict and metric functions
        # sklearn's fit returns the estimator itself; keep both the fitted
        # model and the test predictions for later inspection.
        self.model = self.estimator.func.fit(x_train, y_train)
        self.y_pred = self.model.predict(x_test)
        return self.metric.func(y_test, self.y_pred)
Loading
Loading