Skip to content

Commit

Permalink
Merge pull request #102 from kkovary/101-in-memory-model-loading
Browse files Browse the repository at this point in the history
101 in memory model loading
  • Loading branch information
maclandrol authored Aug 15, 2024
2 parents 9f6062b + 0a62bc5 commit 40414ae
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 14 deletions.
22 changes: 11 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
repos:
- repo: https://github.com/psf/black
rev: "24.8.0" # run `pre-commit autoupdate`
hooks:
- id: black
- repo: https://github.com/psf/black
rev: "24.8.0" # run `pre-commit autoupdate`
hooks:
- id: black
name: black
entry: black
args:
args:
- --check
language: system
types: [python]

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.5.7"
hooks:
- id: ruff
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.5.7"
hooks:
- id: ruff
name: ruff
entry: ruff
args: ["check"]
args: ["check"]
language: system
types: [python]
types: [python]
88 changes: 88 additions & 0 deletions docs/tutorials/loading_models_without_disk_storage.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading Models Without Disk Storage Using `InMemoryModelStore`\n",
"\n",
"## Introduction\n",
"This tutorial introduces the `InMemoryModelStore` class, an alternative to `ModelStore` designed for environments with limited disk space. This approach is particularly beneficial when memory is more readily available than disk space or disk access is limited or slow.\n",
"\n",
"`InMemoryModelStore` enables model loading directly into memory, bypassing the need for local storage and disk I/O operations.\n",
"\n",
"## Using InMemoryModelStore\n",
"Here's how to use `InMemoryModelStore`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from molfeat.store import InMemoryModelStore\n",
"\n",
"# Initialize the in-memory store\n",
"store = InMemoryModelStore(model_store_bucket='s3://my-modelstore-bucket')\n",
"\n",
"# Load a model directly into memory\n",
"model, model_info = store.load('My-Model')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The model is now ready for use, without any disk I/O overhead."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datamol as dm\n",
"from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer\n",
"\n",
"smiles = dm.freesolv().iloc[:100].smiles\n",
"\n",
"transformer = PretrainedHFTransformer(kind=model, notation=model_info.inputs, dtype=float)\n",
"features = transformer(smiles)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benefits and Considerations\n",
"- **Reduced Latency**: By loading models directly into memory, you eliminate the time needed for disk reads.\n",
"- **Efficient Resource Use**: Ideal for serverless environments where disk access might be limited or costly.\n",
"\n",
"However, keep in mind that this approach requires sufficient memory to hold the entire model. Ensure your deployment environment has adequate RAM for your model size."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
4 changes: 3 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ nav:
- Finetuning a pretrained transformer: tutorials/transformer_finetuning.ipynb
- Save and Load: tutorials/save_and_load.ipynb
- The Data Cache: tutorials/datacache.ipynb
- Model Handling:
- Loading Models Without Disk Storage: tutorials/loading_models_without_disk_storage.ipynb

- Developers:
- Contributing: developers/contribute.md
Expand Down Expand Up @@ -97,7 +99,7 @@ watch:

plugins:
- search

- mkdocstrings:
handlers:
python:
Expand Down
1 change: 1 addition & 0 deletions molfeat/store/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .modelcard import ModelInfo
from .modelstore import ModelStore
from .modelstore import InMemoryModelStore
from .modelstore import ModelStoreError
75 changes: 74 additions & 1 deletion molfeat/store/modelstore.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
import pathlib
import tempfile
from typing import Any, Callable, Optional, Union
from typing import Any, Callable, Dict, Optional, Union
import io

import datamol as dm
import filelock
Expand Down Expand Up @@ -333,3 +334,75 @@ def search(self, modelcard: Optional[ModelInfo] = None, **search_kwargs):
if model.match(search_infos, match_only=list(search_infos.keys())):
found.append(model)
return found


class InMemoryModelStore(ModelStore):
    """A model store that loads artifacts directly into memory.

    Unlike the parent :class:`ModelStore`, which caches artifacts on the local
    filesystem, this store reads the model weights and metadata from the remote
    bucket into in-memory buffers. This avoids disk I/O entirely, which is
    useful in environments where disk space or disk access is limited (e.g.
    serverless deployments), at the cost of requiring enough RAM to hold the
    full model.
    """

    def download(
        self,
        modelcard: ModelInfo,
    ) -> Dict[str, io.BytesIO]:
        """Download a model's artifacts into memory.

        Args:
            modelcard: information card of the model to download.

        Returns:
            A dict mapping artifact file names (``self.MODEL_PATH_NAME`` and
            ``self.METADATA_PATH_NAME``) to in-memory ``io.BytesIO`` buffers.

        Raises:
            ModelStoreError: if the model does not exist in the remote store.
        """
        remote_dir = modelcard.path(self.model_store_bucket)
        model_name = modelcard.name
        # check_remote=True: the in-memory store never has a local copy,
        # so only the remote bucket is authoritative.
        if not self.exists(modelcard, check_remote=True):
            raise ModelStoreError(f"Model {model_name} does not exist in the model store!")

        model_remote_path = dm.fs.join(remote_dir, self.MODEL_PATH_NAME)
        metadata_remote_path = dm.fs.join(remote_dir, self.METADATA_PATH_NAME)

        model_data = {}

        # Read each artifact fully into an in-memory buffer; no file is
        # ever written to local disk.
        with fsspec.open(metadata_remote_path, "rb") as IN:
            model_data[self.METADATA_PATH_NAME] = io.BytesIO(IN.read())

        with fsspec.open(model_remote_path, "rb") as IN:
            model_data[self.MODEL_PATH_NAME] = io.BytesIO(IN.read())

        return model_data

    def load(
        self,
        model_name: Union[str, dict, ModelInfo],
        load_fn: Optional[Callable] = None,
        load_fn_kwargs: Optional[dict] = None,
    ):
        """Load a model by its name directly into memory.

        Args:
            model_name: name of the model to load, or an already-resolved
                model card (``ModelInfo`` or dict).
            load_fn: custom loading function applied to the in-memory model
                buffer. Defaults to ``joblib.load``.
            load_fn_kwargs: optional dict of additional kwargs for ``load_fn``.

        Returns:
            model: the loaded model object, or ``None`` if no model artifact
                was downloaded.
            model_info: the model information card.

        Raises:
            ModelStoreError: if ``model_name`` is a string that does not match
                any model in the store, or if the model artifacts are missing.
        """
        if isinstance(model_name, str):
            # Fix: an unknown name previously raised a bare IndexError from
            # the [0] subscript; surface a ModelStoreError instead, matching
            # the error contract used by `download`.
            matches = self.search(name=model_name)
            if not matches:
                raise ModelStoreError(f"Model {model_name} does not exist in the model store!")
            modelcard = matches[0]
        else:
            modelcard = model_name

        model_data = self.download(modelcard=modelcard)

        if load_fn is None:
            load_fn = joblib.load
        load_fn_kwargs = load_fn_kwargs or {}

        model = None
        if self.MODEL_PATH_NAME in model_data:
            # load_fn receives a file-like BytesIO buffer (joblib.load and
            # most deserializers accept file objects).
            model = load_fn(model_data[self.MODEL_PATH_NAME], **load_fn_kwargs)

        # yaml.safe_load accepts a stream, so the metadata buffer is parsed
        # directly without decoding to str first.
        model_info_dict = yaml.safe_load(model_data[self.METADATA_PATH_NAME])
        model_info = ModelInfo(**model_info_dict)
        return model, model_info
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ all = [

test = ["pytest >=6.0","pytest-dotenv", "pytest-cov", "pytest-xdist", "black >=22", "ruff"]
docs = ["mkdocs", "mike", "mdx_truly_sane_lists", "mkdocs-material >=7.1.1", "mkdocs-jupyter >=0.24.8", "mkdocstrings", "mkdocstrings-python", "markdown-include"]
dev = ["molfeat[test]", "molfeat[all]", "molfeat[docs]"]
dev = ["molfeat[test]", "molfeat[all]", "molfeat[docs]", "pre-commit"]

[project.urls]
Website = "https://molfeat.datamol.io"
Expand Down

0 comments on commit 40414ae

Please sign in to comment.