Skip to content

Commit

Permalink
Merge pull request #102 from kkovary/101-in-memory-model-loading
Browse files Browse the repository at this point in the history
101 in memory model loading
  • Loading branch information
maclandrol authored Aug 15, 2024
2 parents 9f6062b + 0a62bc5 commit 40414ae
Show file tree
Hide file tree
Showing 6 changed files with 178 additions and 14 deletions.
22 changes: 11 additions & 11 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
repos:
- repo: https://github.com/psf/black
rev: "24.8.0" # run `pre-commit autoupdate`
hooks:
- id: black
- repo: https://github.com/psf/black
rev: "24.8.0" # run `pre-commit autoupdate`
hooks:
- id: black
name: black
entry: black
args:
args:
- --check
language: system
types: [python]

- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.5.7"
hooks:
- id: ruff
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: "v0.5.7"
hooks:
- id: ruff
name: ruff
entry: ruff
args: ["check"]
args: ["check"]
language: system
types: [python]
types: [python]
88 changes: 88 additions & 0 deletions docs/tutorials/loading_models_without_disk_storage.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Loading Models Without Disk Storage Using `InMemoryModelStore`\n",
"\n",
"## Introduction\n",
"This tutorial introduces the `InMemoryModelStore` class, an alternative to `ModelStore` designed for environments with limited disk space. This approach is particularly beneficial when memory is more readily available than disk space or disk access is limited or slow.\n",
"\n",
"`InMemoryModelStore` enables model loading directly into memory, bypassing the need for local storage and disk I/O operations.\n",
"\n",
"## Using InMemoryModelStore\n",
"Here's how to use `InMemoryModelStore`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from molfeat.store import InMemoryModelStore\n",
"\n",
"# Initialize the in-memory store\n",
"store = InMemoryModelStore(model_store_bucket='s3://my-modelstore-bucket')\n",
"\n",
"# Load a model directly into memory\n",
"model, model_info = store.load('My-Model')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The model is now ready for use, without any disk I/O overhead."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import datamol as dm\n",
"from molfeat.trans.pretrained.hf_transformers import PretrainedHFTransformer\n",
"\n",
"smiles = dm.freesolv().iloc[:100].smiles\n",
"\n",
"transformer = PretrainedHFTransformer(kind=model, notation=model_info.inputs, dtype=float)\n",
"features = transformer(smiles)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Benefits and Considerations\n",
"- **Reduced Latency**: By loading models directly into memory, you eliminate the time needed for disk reads.\n",
"- **Efficient Resource Use**: Ideal for serverless environments where disk access might be limited or costly.\n",
"\n",
"However, keep in mind that this approach requires sufficient memory to hold the entire model. Ensure your deployment environment has adequate RAM for your model size."
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
4 changes: 3 additions & 1 deletion mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ nav:
- Finetuning a pretrained transformer: tutorials/transformer_finetuning.ipynb
- Save and Load: tutorials/save_and_load.ipynb
- The Data Cache: tutorials/datacache.ipynb
- Model Handling:
- Loading Models Without Disk Storage: tutorials/loading_models_without_disk_storage.ipynb

- Developers:
- Contributing: developers/contribute.md
Expand Down Expand Up @@ -97,7 +99,7 @@ watch:

plugins:
- search

- mkdocstrings:
handlers:
python:
Expand Down
1 change: 1 addition & 0 deletions molfeat/store/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .modelcard import ModelInfo
from .modelstore import ModelStore
from .modelstore import InMemoryModelStore
from .modelstore import ModelStoreError
75 changes: 74 additions & 1 deletion molfeat/store/modelstore.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import os
import pathlib
import tempfile
from typing import Any, Callable, Optional, Union
from typing import Any, Callable, Dict, Optional, Union
import io

import datamol as dm
import filelock
Expand Down Expand Up @@ -333,3 +334,75 @@ def search(self, modelcard: Optional[ModelInfo] = None, **search_kwargs):
if model.match(search_infos, match_only=list(search_infos.keys())):
found.append(model)
return found


class InMemoryModelStore(ModelStore):
    """A model store that loads artifacts directly into memory.

    Unlike the parent :class:`ModelStore`, which caches artifacts on the local
    filesystem, this store reads the model weights and metadata from the remote
    bucket into in-memory buffers. This avoids disk I/O entirely, which is
    useful in environments where disk space or disk access is limited (e.g.
    serverless deployments), at the cost of requiring enough RAM to hold the
    full model.
    """

    def download(
        self,
        modelcard: ModelInfo,
    ) -> Dict[str, io.BytesIO]:
        """Download a model's artifacts into memory.

        Args:
            modelcard: information card of the model to download.

        Returns:
            A dict mapping artifact file names (``self.MODEL_PATH_NAME`` and
            ``self.METADATA_PATH_NAME``) to in-memory ``io.BytesIO`` buffers.

        Raises:
            ModelStoreError: if the model does not exist in the remote store.
        """
        remote_dir = modelcard.path(self.model_store_bucket)
        model_name = modelcard.name
        # check_remote=True: the in-memory store never has a local copy,
        # so only the remote bucket is authoritative.
        if not self.exists(modelcard, check_remote=True):
            raise ModelStoreError(f"Model {model_name} does not exist in the model store!")

        model_remote_path = dm.fs.join(remote_dir, self.MODEL_PATH_NAME)
        metadata_remote_path = dm.fs.join(remote_dir, self.METADATA_PATH_NAME)

        model_data = {}

        # Read each artifact fully into an in-memory buffer; no file is
        # ever written to local disk.
        with fsspec.open(metadata_remote_path, "rb") as IN:
            model_data[self.METADATA_PATH_NAME] = io.BytesIO(IN.read())

        with fsspec.open(model_remote_path, "rb") as IN:
            model_data[self.MODEL_PATH_NAME] = io.BytesIO(IN.read())

        return model_data

    def load(
        self,
        model_name: Union[str, dict, ModelInfo],
        load_fn: Optional[Callable] = None,
        load_fn_kwargs: Optional[dict] = None,
    ):
        """Load a model by its name directly into memory.

        Args:
            model_name: name of the model to load, or an already-resolved
                model card (``ModelInfo`` or dict).
            load_fn: custom loading function applied to the in-memory model
                buffer. Defaults to ``joblib.load``.
            load_fn_kwargs: optional dict of additional kwargs for ``load_fn``.

        Returns:
            model: the loaded model object, or ``None`` if no model artifact
                was downloaded.
            model_info: the model information card.

        Raises:
            ModelStoreError: if ``model_name`` is a string that does not match
                any model in the store, or if the model artifacts are missing.
        """
        if isinstance(model_name, str):
            # Fix: an unknown name previously raised a bare IndexError from
            # the [0] subscript; surface a ModelStoreError instead, matching
            # the error contract used by `download`.
            matches = self.search(name=model_name)
            if not matches:
                raise ModelStoreError(f"Model {model_name} does not exist in the model store!")
            modelcard = matches[0]
        else:
            modelcard = model_name

        model_data = self.download(modelcard=modelcard)

        if load_fn is None:
            load_fn = joblib.load
        load_fn_kwargs = load_fn_kwargs or {}

        model = None
        if self.MODEL_PATH_NAME in model_data:
            # load_fn receives a file-like BytesIO buffer (joblib.load and
            # most deserializers accept file objects).
            model = load_fn(model_data[self.MODEL_PATH_NAME], **load_fn_kwargs)

        # yaml.safe_load accepts a stream, so the metadata buffer is parsed
        # directly without decoding to str first.
        model_info_dict = yaml.safe_load(model_data[self.METADATA_PATH_NAME])
        model_info = ModelInfo(**model_info_dict)
        return model, model_info
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ all = [

test = ["pytest >=6.0","pytest-dotenv", "pytest-cov", "pytest-xdist", "black >=22", "ruff"]
docs = ["mkdocs", "mike", "mdx_truly_sane_lists", "mkdocs-material >=7.1.1", "mkdocs-jupyter >=0.24.8", "mkdocstrings", "mkdocstrings-python", "markdown-include"]
dev = ["molfeat[test]", "molfeat[all]", "molfeat[docs]"]
dev = ["molfeat[test]", "molfeat[all]", "molfeat[docs]", "pre-commit"]

[project.urls]
Website = "https://molfeat.datamol.io"
Expand Down

0 comments on commit 40414ae

Please sign in to comment.