feat: Implement cache for predictions #1334

Draft · wants to merge 28 commits into base: master
Changes from 6 commits
28 commits
4abcae2
refactor: rename pipeline cache to operation cache
DRMPN Aug 31, 2024
e2f76d8
chore: Add test-cache.py for benchmarking and debugging purposes
DRMPN Sep 10, 2024
fcab118
chore: add TODOs to insert the data_cache functionality
DRMPN Sep 10, 2024
5248af3
feat: Add DataCache class for storing and loading predictions
DRMPN Sep 10, 2024
ef90a68
feat: Add DataCacheDB class for caching predicted output using a rela…
DRMPN Sep 10, 2024
77e47ca
chore: add TODO to save the predictions
DRMPN Sep 10, 2024
cfaef4f
feat: change the logic to save the entire OutputData instead
DRMPN Nov 19, 2024
612b40e
feat: get/put pickled OutputData into SQL table
DRMPN Nov 19, 2024
8b68240
chore: modify test script to use generated dataset
DRMPN Nov 21, 2024
e86ec6a
chore: modify error message
DRMPN Nov 21, 2024
064ab27
feat: pass data_cache parameter down to store a prediction in DB
DRMPN Nov 21, 2024
6edef1f
feat: test access to the stored data
DRMPN Nov 21, 2024
5328fe9
chore: remove old .pyc files
DRMPN Nov 22, 2024
a9624d1
Merge remote-tracking branch 'origin/master' into DRMPN-better-caching
DRMPN Nov 22, 2024
20a401d
chore: add comment to remove redundant param
DRMPN Nov 22, 2024
0ed1309
fix: take blob column instead of str
DRMPN Nov 22, 2024
64734b3
fix: generate better dataset
DRMPN Nov 22, 2024
662c248
feat: load predicted data from cache to calculate loss function
DRMPN Nov 22, 2024
8a5b424
chore: decrease timeout for test script
DRMPN Nov 27, 2024
9fe700e
feat: add cache for pipeline metrics
DRMPN Nov 27, 2024
12b96ad
feat: add intermediate metrics' cache
DRMPN Nov 28, 2024
d75b8d9
feat: add fit/predict cache for a single node
DRMPN Nov 28, 2024
1f4980b
feat: add cache effectiveness metric
DRMPN Dec 12, 2024
a4718a4
feat: save cache effectiveness to csv file
DRMPN Dec 12, 2024
33e3234
feat: extract metrics cache to dictionary
DRMPN Dec 14, 2024
fa08705
fix: turn on prediction cache
DRMPN Dec 17, 2024
bab641f
chore: turn off fit cache
DRMPN Dec 17, 2024
54f2232
fix: check and grab metric's cache before fit
DRMPN Dec 17, 2024
5 changes: 3 additions & 2 deletions fedot/api/api_utils/api_composer.py
@@ -10,7 +10,7 @@
from fedot.api.api_utils.assumptions.assumptions_handler import AssumptionsHandler
from fedot.api.api_utils.params import ApiParams
from fedot.api.time import ApiTime
from fedot.core.caching.pipelines_cache import OperationsCache
from fedot.core.caching.operations_cache import OperationsCache
from fedot.core.caching.preprocessing_cache import PreprocessingCache
from fedot.core.composer.composer_builder import ComposerBuilder
from fedot.core.composer.gp_composer.gp_composer import GPComposer
@@ -50,6 +50,7 @@ def init_cache(self):
self.preprocessing_cache = PreprocessingCache(cache_dir)
# in case of previously generated singleton cache
self.preprocessing_cache.reset()
# TODO: data_cache

def obtain_model(self, train_data: InputData) -> Tuple[Pipeline, Sequence[Pipeline], OptHistory]:
""" Function for composing FEDOT pipeline model """
@@ -125,7 +126,7 @@ def compose_pipeline(self, train_data: InputData, initial_assumption: Sequence[P
.with_optimizer(self.params.get('optimizer'))
.with_optimizer_params(parameters=self.params.optimizer_params)
.with_metrics(self.metrics)
.with_cache(self.pipelines_cache, self.preprocessing_cache)
.with_cache(self.pipelines_cache, self.preprocessing_cache) # TODO: data_cache
.with_graph_generation_param(self.params.graph_generation_params)
.build())

3 changes: 2 additions & 1 deletion fedot/api/api_utils/assumptions/assumptions_handler.py
@@ -6,7 +6,7 @@
from fedot.api.api_utils.assumptions.assumptions_builder import AssumptionsBuilder
from fedot.api.api_utils.presets import change_preset_based_on_initial_fit
from fedot.api.time import ApiTime
from fedot.core.caching.pipelines_cache import OperationsCache
from fedot.core.caching.operations_cache import OperationsCache
from fedot.core.caching.preprocessing_cache import PreprocessingCache
from fedot.core.data.data import InputData
from fedot.core.data.data_split import train_test_data_setup
@@ -74,6 +74,7 @@ def fit_assumption_and_check_correctness(self,
pipelines_cache.save_pipeline(pipeline)
if preprocessing_cache is not None:
preprocessing_cache.add_preprocessor(pipeline)
# TODO: data_cache

pipeline.predict(data_test)
self.log.info('Initial pipeline was fitted successfully')
2 changes: 1 addition & 1 deletion fedot/core/caching/base_cache.py
@@ -3,7 +3,7 @@
from golem.core.log import default_log
from golem.utilities.singleton_meta import SingletonMeta

from fedot.core.caching.pipelines_cache_db import OperationsCacheDB
from fedot.core.caching.operations_cache_db import OperationsCacheDB
from fedot.core.caching.preprocessing_cache_db import PreprocessingCacheDB


105 changes: 105 additions & 0 deletions fedot/core/caching/data_cache.py
@@ -0,0 +1,105 @@
import sqlite3
from typing import TYPE_CHECKING, List, Optional, Union

import numpy as np

from fedot.core.caching.base_cache import BaseCache
from fedot.core.caching.data_cache_db import DataCacheDB
from fedot.core.data.data import OutputData

if TYPE_CHECKING:
from fedot.core.pipelines.pipeline import Pipeline


class DataCache(BaseCache):
"""
Stores/loads predictions to increase performance of calculations.

:param cache_dir: path to the place where cache files should be stored.
"""

def __init__(self, cache_dir: Optional[str] = None, custom_pid=None):
super().__init__(DataCacheDB(cache_dir, custom_pid))

def save_prediction(self, prediction: np.ndarray, uid: str):
"""
Save the prediction for a given UID.

:param prediction (np.ndarray): The prediction to be saved.
:param uid (str): The unique identifier for the prediction.
"""
try:
self._db.add_prediction([(uid, prediction)])
except Exception as ex:
unexpected_exc = not (
isinstance(ex, sqlite3.DatabaseError) and "disk is full" in str(ex)
)
self.log.warning(
    f"Predictions cannot be saved: {ex}. Continuing",
    exc=ex,
    raise_if_test=unexpected_exc,
)

def load_prediction(self, uid: str) -> Optional[np.ndarray]:
    """
    Load the prediction data for the given unique identifier.

    :param uid (str): The unique identifier of the prediction data.
    :return np.ndarray: The loaded prediction data, or None if it is not cached.
    """
    predictions = self._db.get_prediction([uid])
    # TODO: restore OutputData from predict
    return predictions[0] if predictions else None

def save_data(
self,
pipeline: "Pipeline",
outputData: OutputData,
fold_id: Optional[int] = None,
):
"""
Save the pipeline data to the cache.

:param pipeline: The pipeline data to be cached.
:type pipeline: Pipeline
:param outputData: The output data to be saved.
:type outputData: OutputData
:param fold_id: Optional part of the cache item UID (can be used to specify the number of CV fold).
:type fold_id: Optional[int]
"""
uid = self._create_uid(pipeline, fold_id)
# TODO: save OutputData as a whole to the cache
self.save_prediction(outputData.predict, uid)

def try_load_data(
    self, pipeline: "Pipeline", fold_id: Optional[int] = None
) -> OutputData:
    """
    Try to load cached data for the given pipeline and fold ID.

    :param pipeline (Pipeline): The pipeline for which to load the data.
    :param fold_id (Optional[int]): The fold ID for which to load the data. Defaults to None.
    :return OutputData: The loaded data, or None if nothing is cached.
    """
    # TODO: restore the full OutputData from the cached prediction
    uid = self._create_uid(pipeline, fold_id)
    return self.load_prediction(uid)

def _create_uid(
self,
pipeline: "Pipeline",
fold_id: Optional[int] = None,
) -> str:
"""
Generate a unique identifier for a pipeline.

:param pipeline (Pipeline): The pipeline for which the unique identifier is generated.
:param fold_id (Optional[int]): The fold ID (default: None).
:return str: The unique identifier generated for the pipeline.
"""
base_uid = ""
for node in pipeline.nodes:
base_uid += f"{node.descriptive_id}_"
if fold_id is not None:
base_uid += f"{fold_id}"
return base_uid
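
A minimal usage sketch of the `DataCache` class above, assuming a fitted `Pipeline` object `pipeline` and an `OutputData` result `train_output` already exist in the calling code; the cache directory and fold id are illustrative.

```python
# Hypothetical usage of DataCache; `pipeline` and `train_output` are assumed
# to come from the surrounding code (e.g. from pipeline.fit(train_data)).
from fedot.core.caching.data_cache import DataCache

data_cache = DataCache(cache_dir="/tmp/fedot_cache")

# The cache key is built from the descriptive ids of all pipeline nodes
# plus the optional CV fold number (see _create_uid above).
data_cache.save_data(pipeline, train_output, fold_id=0)

# Later, the stored prediction can be fetched for the same pipeline/fold
# instead of being recomputed.
cached_predict = data_cache.try_load_data(pipeline, fold_id=0)
if cached_predict is not None:
    print("prediction served from cache")
```
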
90 changes: 90 additions & 0 deletions fedot/core/caching/data_cache_db.py
@@ -0,0 +1,90 @@
import pickle
import sqlite3
from contextlib import closing
from os import getpid
from typing import List, Optional, Tuple, TypeVar

import numpy as np

from fedot.core.caching.base_cache_db import BaseCacheDB


class DataCacheDB(BaseCacheDB):
"""
Database for `DataCache` class.
Implements the low-level logic of caching predicted output in a relational database.

:param cache_dir: path to the place where cache files should be stored.
"""

def __init__(self, cache_dir: Optional[str] = None, custom_pid=None):
super().__init__("prediction", cache_dir)
self._init_db()

def add_prediction(self, uid_val_lst: List[Tuple[str, np.ndarray]]):
"""
Adds predictions to the DB table, keyed by their uids

:param uid_val_lst: list of pairs (uid -> prediction) to be saved
"""
try:
with closing(sqlite3.connect(self.db_path)) as conn:
with conn:
Review comment on lines +30 to +31 (@kasyanovse, Collaborator, Oct 13, 2024):

Why not implement DataCacheDB as a singleton, connecting to the DB once at initialization (and re-initializing it in new Python instances when running in parallel)?

cur = conn.cursor()
pickled = [
(
uid,
sqlite3.Binary(pickle.dumps(val, pickle.HIGHEST_PROTOCOL)),
)
for uid, val in uid_val_lst
]
cur.executemany(
f"INSERT OR IGNORE INTO {self._main_table} VALUES (?, ?);",
pickled,
)
except sqlite3.Error as e:
print(f"SQLite error: {e}")

def get_prediction(self, uids: List[str]) -> List[Optional[np.ndarray]]:
"""
Maps the given uids to cached predictions from the DB, putting None where a uid is not present.

:param uids: list of prediction uids to look up

:return retrieved: list of predictions taken from the DB table, with None where a uid wasn't present
"""
try:
with closing(sqlite3.connect(self.db_path)) as conn:
with conn:
cur = conn.cursor()
placeholders = ",".join("?" for _ in uids)
query = (
f"SELECT id, prediction FROM {self._main_table} "
f"WHERE id IN ({placeholders})"
)
cur.execute(query, uids)
results = {row[0]: pickle.loads(row[1]) for row in cur.fetchall()}
retrieved = [results.get(uid) for uid in uids]
return retrieved
except sqlite3.Error as e:
print(f"SQLite error: {e}")
return [None] * len(uids)

def _init_db(self):
"""
Initializes DB working table.
"""
try:
with closing(sqlite3.connect(self.db_path)) as conn:
with conn:
cur = conn.cursor()
cur.execute(
(
f"CREATE TABLE IF NOT EXISTS {self._main_table} ("
"id TEXT PRIMARY KEY,"
"prediction BLOB"
");"
)
)
except sqlite3.Error as e:
print(f"SQLite error: {e}")
@@ -4,7 +4,7 @@
from golem.utilities.data_structures import ensure_wrapped_in_sequence

from fedot.core.caching.base_cache import BaseCache
from fedot.core.caching.pipelines_cache_db import OperationsCacheDB
from fedot.core.caching.operations_cache_db import OperationsCacheDB
from fedot.core.pipelines.node import PipelineNode

if TYPE_CHECKING:
4 changes: 3 additions & 1 deletion fedot/core/composer/composer_builder.py
@@ -10,7 +10,7 @@
from golem.core.optimisers.optimizer import AlgorithmParameters, GraphGenerationParams, GraphOptimizer
from golem.utilities.data_structures import ensure_wrapped_in_sequence

from fedot.core.caching.pipelines_cache import OperationsCache
from fedot.core.caching.operations_cache import OperationsCache
from fedot.core.caching.preprocessing_cache import PreprocessingCache
from fedot.core.composer.composer import Composer
from fedot.core.composer.gp_composer.gp_composer import GPComposer
@@ -55,6 +55,7 @@ def __init__(self, task: Task):

self.pipelines_cache: Optional[OperationsCache] = None
self.preprocessing_cache: Optional[PreprocessingCache] = None
# TODO: self.data_cache: Optional[DataCache] = None

def with_composer(self, composer_cls: Optional[Type[Composer]]):
if composer_cls is not None:
@@ -100,6 +101,7 @@ def with_cache(self, pipelines_cache: Optional[OperationsCache] = None,
preprocessing_cache: Optional[PreprocessingCache] = None):
self.pipelines_cache = pipelines_cache
self.preprocessing_cache = preprocessing_cache
# TODO: self.data_cache = data_cache
return self

@staticmethod
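
A hedged sketch of how the two TODOs in this file might be resolved: `with_cache` gains an optional `data_cache` argument that is stored next to the other caches. The stand-in class and signature below are assumptions, not the PR's final API.

```python
# Possible shape of the extended builder method (illustrative only).
from typing import Optional

from fedot.core.caching.data_cache import DataCache
from fedot.core.caching.operations_cache import OperationsCache
from fedot.core.caching.preprocessing_cache import PreprocessingCache


class ComposerBuilderSketch:
    """Stand-in for ComposerBuilder, showing only the cache-related part."""

    def __init__(self):
        self.pipelines_cache: Optional[OperationsCache] = None
        self.preprocessing_cache: Optional[PreprocessingCache] = None
        self.data_cache: Optional[DataCache] = None

    def with_cache(self,
                   pipelines_cache: Optional[OperationsCache] = None,
                   preprocessing_cache: Optional[PreprocessingCache] = None,
                   data_cache: Optional[DataCache] = None):
        self.pipelines_cache = pipelines_cache
        self.preprocessing_cache = preprocessing_cache
        self.data_cache = data_cache  # would be forwarded to the objective evaluator
        return self
```
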
2 changes: 1 addition & 1 deletion fedot/core/composer/gp_composer/gp_composer.py
@@ -3,7 +3,7 @@
from golem.core.optimisers.graph import OptGraph
from golem.core.optimisers.optimizer import GraphOptimizer

from fedot.core.caching.pipelines_cache import OperationsCache
from fedot.core.caching.operations_cache import OperationsCache
from fedot.core.caching.preprocessing_cache import PreprocessingCache
from fedot.core.composer.composer import Composer
from fedot.core.data.data import InputData
8 changes: 6 additions & 2 deletions fedot/core/optimisers/objective/data_objective_eval.py
@@ -8,7 +8,7 @@
from golem.core.optimisers.objective.objective import Objective, to_fitness
from golem.core.optimisers.objective.objective_eval import ObjectiveEvaluate

from fedot.core.caching.pipelines_cache import OperationsCache
from fedot.core.caching.operations_cache import OperationsCache
from fedot.core.caching.preprocessing_cache import PreprocessingCache
from fedot.core.data.data import InputData
from fedot.core.operations.model import Model
@@ -49,6 +49,7 @@ def __init__(self,
self._validation_blocks = validation_blocks
self._pipelines_cache = pipelines_cache
self._preprocessing_cache = preprocessing_cache
# TODO: self._data_cache = data_cache
self._log = default_log(self)
self._do_unfit = do_unfit

@@ -111,7 +112,7 @@ def prepare_graph(self, graph: Pipeline, train_data: InputData,

# load preprocessing
graph.try_load_from_cache(self._pipelines_cache, self._preprocessing_cache, fold_id)
graph.fit(
predicted_train = graph.fit(
train_data,
n_jobs=n_jobs,
time_constraint=self._time_constraint
@@ -121,6 +122,9 @@
self._pipelines_cache.save_pipeline(graph, fold_id)
if self._preprocessing_cache is not None:
self._preprocessing_cache.add_preprocessor(graph, fold_id)
# TODO:
# if self._data_cache is not None:
# self._data_cache.save_data(graph, predicted_train, fold_id)

return graph

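
A hedged sketch of the commented-out TODO above, written as a standalone helper; the function name is hypothetical, and it assumes `Pipeline.fit` returns the training `OutputData`, as the diff above suggests.

```python
# Illustrative helper mirroring the TODO in prepare_graph (not the PR's final code).
from typing import Optional

from fedot.core.caching.data_cache import DataCache
from fedot.core.data.data import InputData, OutputData
from fedot.core.pipelines.pipeline import Pipeline


def fit_and_cache_prediction(pipeline: Pipeline,
                             train_data: InputData,
                             data_cache: Optional[DataCache],
                             fold_id: Optional[int] = None) -> OutputData:
    predicted_train = pipeline.fit(train_data)
    if data_cache is not None:
        # Store the training prediction so the loss can later be recomputed
        # from cache (see commit 662c248) without refitting the pipeline.
        data_cache.save_data(pipeline, predicted_train, fold_id)
    return predicted_train
```
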
5 changes: 5 additions & 0 deletions fedot/core/pipelines/node.py
@@ -195,6 +195,8 @@ def fit(self, input_data: InputData) -> OutputData:

input_data = self._get_input_data(input_data=input_data, parent_operation='fit')

# TODO: try load from cache

if self.fitted_operation is None:
with Timer() as t:
self.fitted_operation, operation_predict = self.operation.fit(params=self._parameters,
@@ -233,6 +235,9 @@ def predict(self, input_data: InputData, output_mode: str = 'default') -> Output
data=input_data,
output_mode=output_mode)
self.inference_time_in_seconds = round(t.seconds_from_start, 3)

# TODO: save predict to cache

return operation_predict

def get_data_from_node(self) -> dict:
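
A hedged sketch of the per-node hooks the two TODOs above point at ("try load from cache" before fit, "save predict to cache" after predict). The helper name and the node-level key scheme are assumptions; `DataCacheDB` pickles arbitrary objects, so an `OutputData` value round-trips even though its annotations say `np.ndarray`.

```python
# Illustrative per-node cache hook; not part of this diff.
from typing import Optional

from fedot.core.caching.data_cache_db import DataCacheDB
from fedot.core.data.data import InputData, OutputData
from fedot.core.pipelines.node import PipelineNode


def node_predict_with_cache(node: PipelineNode,
                            input_data: InputData,
                            cache_db: Optional[DataCacheDB],
                            fold_id: Optional[int] = None) -> OutputData:
    # Hypothetical key: the node's descriptive_id plus the optional fold number.
    uid = node.descriptive_id if fold_id is None else f"{node.descriptive_id}_{fold_id}"
    if cache_db is not None:
        cached = cache_db.get_prediction([uid])[0]
        if cached is not None:
            return cached
    prediction = node.predict(input_data)
    if cache_db is not None:
        cache_db.add_prediction([(uid, prediction)])
    return prediction
```
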
5 changes: 4 additions & 1 deletion fedot/core/pipelines/pipeline.py
@@ -15,7 +15,7 @@
from golem.utilities.serializable import Serializable
from golem.visualisation.graph_viz import NodeColorType

from fedot.core.caching.pipelines_cache import OperationsCache
from fedot.core.caching.operations_cache import OperationsCache
from fedot.core.caching.preprocessing_cache import PreprocessingCache
from fedot.core.data.data import InputData, OutputData
from fedot.core.data.multi_modal import MultiModalData
@@ -252,6 +252,7 @@ def try_load_from_cache(self, cache: Optional[OperationsCache], preprocessing_ca
cache.try_load_into_pipeline(self, fold_id)
if preprocessing_cache is not None:
preprocessing_cache.try_load_preprocessor(self, fold_id)
# TODO: data_cache

def predict(self, input_data: Union[InputData, MultiModalData], output_mode: str = 'default') -> OutputData:
"""Runs the predict process in all of the pipeline nodes starting with root
@@ -270,6 +271,8 @@ def predict(self, input_data: Union[InputData, MultiModalData], output_mode: str
OutputData: values predicted on the provided ``input_data``
"""

# TODO: data_cache

if not self.is_fitted:
ex = 'Pipeline is not fitted yet'
self.log.error(ex)