diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index cc08b2d..05bbf1e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,21 +1,21 @@ repos: -- repo: https://github.com/psf/black - rev: "24.8.0" # run `pre-commit autoupdate` - hooks: - - id: black + - repo: https://github.com/psf/black + rev: "24.8.0" # run `pre-commit autoupdate` + hooks: + - id: black name: black entry: black - args: + args: - --check language: system types: [python] -- repo: https://github.com/astral-sh/ruff-pre-commit - rev: "v0.5.7" - hooks: - - id: ruff + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: "v0.5.7" + hooks: + - id: ruff name: ruff entry: ruff - args: ["check"] + args: ["check"] language: system - types: [python] \ No newline at end of file + types: [python] diff --git a/docs/tutorials/loading_models_without_disk_storage.ipynb b/docs/tutorials/loading_models_without_disk_storage.ipynb new file mode 100644 index 0000000..0e4fa14 --- /dev/null +++ b/docs/tutorials/loading_models_without_disk_storage.ipynb @@ -0,0 +1,88 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Loading Models Without Disk Storage Using `InMemoryModelStore`\n", + "\n", + "## Introduction\n", + "This tutorial introduces the `InMemoryModelStore` class, an alternative to `ModelStore` designed for environments with limited disk space. 
This approach is particularly beneficial when memory is more readily available than disk space, or when disk access is limited or slow.\n", + "\n", + "`InMemoryModelStore` enables model loading directly into memory, bypassing the need for local storage and disk I/O operations.\n", + "\n", + "## Using InMemoryModelStore\n", + "Here's how to use `InMemoryModelStore`:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from molfeat.store import InMemoryModelStore\n", + "\n", + "# Initialize the in-memory store\n", + "store = InMemoryModelStore(model_store_bucket='s3://my-modelstore-bucket')\n", + "\n", + "# Load a model directly into memory\n", + "model, model_info = store.load('My-Model')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The model is now ready for use without any disk I/O overhead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import datamol as dm\n", + "from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer\n", + "\n", + "smiles = dm.freesolv().iloc[:100].smiles\n", + "\n", + "transformer = PretrainedHFTransformer(kind=model, notation=model_info.inputs, dtype=float)\n", + "features = transformer(smiles)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Benefits and Considerations\n", + "- **Reduced Latency**: By loading models directly into memory, you eliminate the time needed for disk reads.\n", + "- **Efficient Resource Use**: Ideal for serverless environments where disk access might be limited or costly.\n", + "\n", + "However, keep in mind that this approach requires sufficient memory to hold the entire model. Ensure your deployment environment has adequate RAM for your model size." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/mkdocs.yml b/mkdocs.yml index c828af7..db0fce8 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -27,6 +27,8 @@ nav: - Finetuning a pretrained transformer: tutorials/transformer_finetuning.ipynb - Save and Load: tutorials/save_and_load.ipynb - The Data Cache: tutorials/datacache.ipynb + - Model Handling: + - Loading Models Without Disk Storage: tutorials/loading_models_without_disk_storage.ipynb - Developers: - Contributing: developers/contribute.md @@ -97,7 +99,7 @@ watch: plugins: - search - + - mkdocstrings: handlers: python: diff --git a/molfeat/store/__init__.py b/molfeat/store/__init__.py index f77e76a..3969f99 100644 --- a/molfeat/store/__init__.py +++ b/molfeat/store/__init__.py @@ -1,3 +1,4 @@ from .modelcard import ModelInfo from .modelstore import ModelStore +from .modelstore import InMemoryModelStore from .modelstore import ModelStoreError diff --git a/molfeat/store/modelstore.py b/molfeat/store/modelstore.py index aba0830..dd23805 100644 --- a/molfeat/store/modelstore.py +++ b/molfeat/store/modelstore.py @@ -1,7 +1,8 @@ import os import pathlib import tempfile -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, Dict, Optional, Union +import io import datamol as dm import filelock @@ -333,3 +334,75 @@ def search(self, modelcard: Optional[ModelInfo] = None, **search_kwargs): if model.match(search_infos, match_only=list(search_infos.keys())): found.append(model) return found + + +class InMemoryModelStore(ModelStore): + """A class for loading models directly into memory from ModelStore""" + + 
def download(
    self,
    modelcard: ModelInfo,
) -> Dict[str, io.BytesIO]:
    """Download a model artifact into memory.

    Each remote file is read in full and wrapped in an ``io.BytesIO``
    buffer; this method does not write anything to a local path.

    Args:
        modelcard: information on the model to download

    Returns:
        A dictionary with file names as keys and file-like objects as values.
        It holds two entries, keyed by ``self.METADATA_PATH_NAME`` and
        ``self.MODEL_PATH_NAME``.

    Raises:
        ModelStoreError: if ``self.exists(modelcard, check_remote=True)``
            reports that the model is not in the model store.
    """
    remote_dir = modelcard.path(self.model_store_bucket)
    model_name = modelcard.name
    if not self.exists(modelcard, check_remote=True):
        raise ModelStoreError(f"Model {model_name} does not exist in the model store!")

    model_data = {}
    # Fetch the metadata first and the model second (same order as before).
    for file_name in (self.METADATA_PATH_NAME, self.MODEL_PATH_NAME):
        remote_path = dm.fs.join(remote_dir, file_name)
        with fsspec.open(remote_path, "rb") as IN:
            model_data[file_name] = io.BytesIO(IN.read())

    return model_data


def load(
    self,
    model_name: Union[str, dict, ModelInfo],
    load_fn: Optional[Callable] = None,
    load_fn_kwargs: Optional[dict] = None,
):
    """Load a model and its model card from in-memory buffers.

    Args:
        model_name: name of the model to load, or its model card given either
            as a ``ModelInfo`` or as a dict of ``ModelInfo`` keyword arguments
        load_fn: Custom loading function to load the model. It is called with
            the in-memory file object as first argument and defaults to
            ``joblib.load``.
        load_fn_kwargs: Optional dict of additional kwargs to provide to the loading function

    Returns:
        model: the object returned by the loading function, or None when the
            downloaded data has no ``self.MODEL_PATH_NAME`` entry
        model_info: model information card built from the downloaded metadata
    """
    if isinstance(model_name, str):
        # NOTE(review): when no model matches the name, the search result is
        # presumably empty and this indexing raises IndexError (unchanged).
        modelcard = self.search(name=model_name)[0]
    elif isinstance(model_name, dict):
        # The signature accepts a dict, but download() reads ModelInfo
        # attributes (``path``, ``name``), so build the card before use.
        modelcard = ModelInfo(**model_name)
    else:
        modelcard = model_name

    model_data = self.download(modelcard=modelcard)

    if load_fn is None:
        load_fn = joblib.load
    load_fn_kwargs = load_fn_kwargs or {}

    model = None
    if self.MODEL_PATH_NAME in model_data:
        model = load_fn(model_data[self.MODEL_PATH_NAME], **load_fn_kwargs)

    model_info_dict = yaml.safe_load(model_data[self.METADATA_PATH_NAME])
    model_info = ModelInfo(**model_info_dict)
    return model, model_info
b/pyproject.toml index 764cf13..60bda22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -83,7 +83,7 @@ all = [ test = ["pytest >=6.0","pytest-dotenv", "pytest-cov", "pytest-xdist", "black >=22", "ruff"] docs = ["mkdocs", "mike", "mdx_truly_sane_lists", "mkdocs-material >=7.1.1", "mkdocs-jupyter >=0.24.8", "mkdocstrings", "mkdocstrings-python", "markdown-include"] -dev = ["molfeat[test]", "molfeat[all]", "molfeat[docs]"] +dev = ["molfeat[test]", "molfeat[all]", "molfeat[docs]", "pre-commit"] [project.urls] Website = "https://molfeat.datamol.io"