From 20b80b1f479274611b7f8951498f1fb0eab5e152 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Wed, 14 Feb 2024 11:16:25 -0500
Subject: [PATCH 1/6] Add bge-m3 embeddings.

---
 lilac/embeddings/bge.py                       |  94 ++++++++++++++++
 lilac/signals/default_signals.py              |   3 +
 notebooks/Test.ipynb                          |   0
 poetry.lock                                   | 101 +++++++++++++-----
 pyproject.toml                                |   4 +
 .../fastapi_client/models/ConceptSignal.ts    |   2 +-
 .../models/SemanticSimilaritySignal.ts        |   2 +-
 7 files changed, 178 insertions(+), 28 deletions(-)
 create mode 100644 lilac/embeddings/bge.py
 create mode 100644 notebooks/Test.ipynb

diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py
new file mode 100644
index 000000000..0c578b1ee
--- /dev/null
+++ b/lilac/embeddings/bge.py
@@ -0,0 +1,94 @@
+"""BGE-M3 embedding model. Open-source model, designed to run on device."""
+import gc
+from typing import TYPE_CHECKING, ClassVar, Iterator, Optional
+
+from typing_extensions import override
+
+if TYPE_CHECKING:
+  from FlagEmbedding import BGEM3FlagModel
+
+
+import functools
+
+from ..schema import Item
+from ..signal import TextEmbeddingSignal
+from ..splitters.spacy_splitter import clustering_spacy_chunker
+from ..tasks import TaskExecutionType
+from .embedding import chunked_compute_embedding
+from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
+
+# See https://huggingface.co/spaces/mteb/leaderboard for a leaderboard of models.
+BGE_M3 = 'BAAI/bge-m3'
+
+
+@functools.cache
+def _get_and_cache_bge_m3(model_name: str) -> 'BGEM3FlagModel':
+  try:
+    from FlagEmbedding import BGEM3FlagModel
+  except ImportError:
+    raise ImportError(
+      'Could not import the "FlagEmbedding" python package. '
+      'Please install it with `pip install "lilac[bge]"`.'
+    )
+  model = BGEM3FlagModel(
+    'BAAI/bge-m3', use_fp16=True
+  )  # Setting use_fp16 to True speeds up computation with a slight performance degradation
+  return model
+  return setup_model_device(model, model_name)
+
+
+class BGEM3(TextEmbeddingSignal):
+  """Computes BGE-M3 embeddings.
+
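+  BGE-M3 supports dense, sparse, and multi-vector (ColBERT) retrieval; this
+  signal keeps only the dense vectors.
+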
This embedding runs on-device. See the [model card](https://huggingface.co/BAAI/bge-m3) + for details. + """ + + name: ClassVar[str] = 'bge-m3' + display_name: ClassVar[str] = 'BGE-M3' + local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE + local_parallelism: ClassVar[int] = 1 + local_strategy: ClassVar[TaskExecutionType] = 'threads' + supports_garden: ClassVar[bool] = False + + _model_name = BGE_M3 + _model: 'BGEM3FlagModel' + + @override + def setup(self) -> None: + self._model = _get_and_cache_bge_m3(self._model_name) + + @override + def compute(self, docs: list[str]) -> list[Optional[Item]]: + """Call the embedding function.""" + + def _encode(doc): + # Extract the dense vectors from the model. + return self._model.encode(doc)['dense_vecs'] + + # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10. + # The sentence transformer API actually does batching internally, so we pass + # local_batch_size * 16 to allow the library to see all the chunks at once. + return chunked_compute_embedding( + _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker + ) + + @override + def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]: + raise NotImplementedError('Garden computation is not supported for BGE-M3.') + + @override + def teardown(self) -> None: + if not hasattr(self, '_model'): + return + + self._model.cpu() + del self._model + gc.collect() + + try: + import torch + + torch.cuda.empty_cache() + except ImportError: + pass diff --git a/lilac/signals/default_signals.py b/lilac/signals/default_signals.py index b89988ee6..451788ecc 100644 --- a/lilac/signals/default_signals.py +++ b/lilac/signals/default_signals.py @@ -1,5 +1,6 @@ """Registers all available default signals.""" +from ..embeddings.bge import BGEM3 from ..embeddings.cohere import Cohere from ..embeddings.gte import GTEBase, GTESmall, GTETiny from ..embeddings.jina import JinaV2Base, JinaV2Small @@ -43,3 +44,5 @@ def register_default_signals() -> None: register_signal(JinaV2Small, exists_ok=True) register_signal(JinaV2Base, exists_ok=True) + + register_signal(BGEM3, exists_ok=True) diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb new file mode 100644 index 000000000..e69de29bb diff --git a/poetry.lock b/poetry.lock index 0e67239c8..f5717225b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,5 +1,35 @@ # This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand. 
+[[package]] +name = "accelerate" +version = "0.27.2" +description = "Accelerate" +optional = true +python-versions = ">=3.8.0" +files = [ + {file = "accelerate-0.27.2-py3-none-any.whl", hash = "sha256:a818dd27b9ba24e9eb5030d1b285cf4cdd1b41bbfa675fb4eb2477ddfc097074"}, + {file = "accelerate-0.27.2.tar.gz", hash = "sha256:cc715fe9a8bc7a286259bfb6d65fb78363badd3371e7cbda4e4a4ef34a0010aa"}, +] + +[package.dependencies] +huggingface-hub = "*" +numpy = ">=1.17" +packaging = ">=20.0" +psutil = "*" +pyyaml = "*" +safetensors = ">=0.3.1" +torch = ">=1.10.0" + +[package.extras] +dev = ["bitsandbytes", "black (>=23.1,<24.0)", "datasets", "deepspeed (<0.13.0)", "evaluate", "hf-doc-builder (>=0.3.0)", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "rich", "ruff (>=0.1.15,<0.2.0)", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +quality = ["black (>=23.1,<24.0)", "hf-doc-builder (>=0.3.0)", "ruff (>=0.1.15,<0.2.0)"] +rich = ["rich"] +sagemaker = ["sagemaker"] +test-dev = ["bitsandbytes", "datasets", "deepspeed (<0.13.0)", "evaluate", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] +test-prod = ["parameterized", "pytest", "pytest-subtests", "pytest-xdist"] +test-trackers = ["comet-ml", "dvclive", "tensorboard", "wandb"] +testing = ["bitsandbytes", "datasets", "deepspeed (<0.13.0)", "evaluate", "parameterized", "pytest", "pytest-subtests", "pytest-xdist", "scikit-learn", "scipy", "timm", "torchpippy (>=0.2.0)", "tqdm", "transformers"] + [[package]] name = "aiohttp" version = "3.9.1" @@ -1527,6 +1557,23 @@ docs = ["furo (>=2023.9.10)", "sphinx (>=7.2.6)", "sphinx-autodoc-typehints (>=1 testing = ["covdefaults (>=2.3)", "coverage (>=7.3.2)", "diff-cover (>=8)", "pytest (>=7.4.3)", "pytest-cov (>=4.1)", "pytest-mock (>=3.12)", "pytest-timeout (>=2.2)"] typing = ["typing-extensions (>=4.8)"] +[[package]] +name = "flagembedding" +version = "1.2.3" +description = "FlagEmbedding" +optional = true +python-versions = "*" +files = [ + {file = "FlagEmbedding-1.2.3.tar.gz", hash = "sha256:f16ab1fa59969dd28e4729f0157a5225ba21a429b3adfa5cddf34c2ad0d0c3be"}, +] + +[package.dependencies] +accelerate = ">=0.20.1" +datasets = "*" +sentence_transformers = "*" +torch = ">=1.6.0" +transformers = ">=4.33.0" + [[package]] name = "floret" version = "0.10.5" @@ -4113,6 +4160,7 @@ files = [ {file = "numpy-1.26.3-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:3c67423b3703f8fbd90f5adaa37f85b5794d3366948efe9a5190a5f3a83fc34e"}, {file = "numpy-1.26.3-pp39-pypy39_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:46f47ee566d98849323f01b349d58f2557f02167ee301e5e28809a8c0e27a2d0"}, {file = "numpy-1.26.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a8474703bffc65ca15853d5fd4d06b18138ae90c17c8d12169968e998e448bb5"}, + {file = "numpy-1.26.3.tar.gz", hash = "sha256:697df43e2b6310ecc9d95f05d5ef20eacc09c7c4ecc9da3f235d39e71b7da1e4"}, ] [[package]] @@ -7054,52 +7102,52 @@ test = ["argcomplete (>=3.0.3)", "mypy (>=1.7.0)", "pre-commit", "pytest (>=7.0, [[package]] name = "transformers" -version = "4.35.2" +version = "4.37.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = true python-versions = ">=3.8.0" files = [ - {file = "transformers-4.35.2-py3-none-any.whl", hash = "sha256:9dfa76f8692379544ead84d98f537be01cd1070de75c74efb13abcbc938fbe2f"}, - {file = "transformers-4.35.2.tar.gz", hash = "sha256:2d125e197d77b0cdb6c9201df9fa7e2101493272e448b9fba9341c695bee2f52"}, + {file = 
"transformers-4.37.2-py3-none-any.whl", hash = "sha256:595a8b12a1fcc4ad0ced49ce206c58e17be68c85d7aee3d7546d04a32c910d2e"}, + {file = "transformers-4.37.2.tar.gz", hash = "sha256:f307082ae5d528b8480611a4879a4a11651012d0e9aaea3f6cf17219ffd95542"}, ] [package.dependencies] filelock = "*" -huggingface-hub = ">=0.16.4,<1.0" +huggingface-hub = ">=0.19.3,<1.0" numpy = ">=1.17" packaging = ">=20.0" pyyaml = ">=5.1" regex = "!=2019.12.17" requests = "*" -safetensors = ">=0.3.1" +safetensors = ">=0.4.1" tokenizers = ">=0.14,<0.19" tqdm = ">=4.27" [package.extras] -accelerate = ["accelerate (>=0.20.3)"] -agents = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.10,!=1.12.0)"] -all = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] +accelerate = ["accelerate (>=0.21.0)"] +agents = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "datasets (!=2.5.0)", "diffusers", "opencv-python", "sentencepiece (>=0.1.91,!=0.1.92)", "torch (>=1.11,!=1.12.0)"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.11,!=1.12.0)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] codecarbon = ["codecarbon (==1.2.0)"] -deepspeed = ["accelerate (>=0.20.3)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", 
"pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.14,<0.19)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "Pillow (<10.0.0)", "accelerate (>=0.20.3)", "beautifulsoup4", "black (>=23.1,<24.0)", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune]", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (>=0.0.241,<=0.0.259)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] -docs = ["Pillow (<10.0.0)", "accelerate (>=0.20.3)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune]", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "torchaudio", "torchvision"] +deepspeed = ["accelerate (>=0.21.0)", "deepspeed (>=0.9.3)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.21.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "optuna", "parameterized", "protobuf", "psutil", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", 
"rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "decord (==0.6.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.11,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "nltk", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.14,<0.19)", "urllib3 (<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "beautifulsoup4", "codecarbon (==1.2.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "hf-doc-builder", "hf-doc-builder (>=0.3.0)", "ipadic (>=1.0.0,<2.0)", "isort (>=5.5.4)", "kenlm", "librosa", "nltk", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "parameterized", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.11,!=1.12.0)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +docs = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.21.0)", "av (==9.2.0)", "codecarbon (==1.2.0)", "decord (==0.6.0)", "flax (>=0.4.1,<=0.7.0)", "hf-doc-builder", "jax 
(>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "kenlm", "keras-nlp (>=0.3.1)", "librosa", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm", "tokenizers (>=0.14,<0.19)", "torch (>=1.11,!=1.12.0)", "torchaudio", "torchvision"] docs-specific = ["hf-doc-builder"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] -integrations = ["optuna", "ray[tune]", "sigopt"] +integrations = ["optuna", "ray[tune] (>=2.7.0)", "sigopt"] ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] modelcreation = ["cookiecutter (==1.7.3)"] -natten = ["natten (>=0.14.6)"] +natten = ["natten (>=0.14.6,<0.15.0)"] onnx = ["onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "tf2onnx"] onnxruntime = ["onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)"] optuna = ["optuna"] -quality = ["GitPython (<3.1.19)", "black (>=23.1,<24.0)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (>=0.0.241,<=0.0.259)", "urllib3 (<2.0.0)"] -ray = ["ray[tune]"] +quality = ["GitPython (<3.1.19)", "datasets (!=2.5.0)", "hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.1.5)", "urllib3 (<2.0.0)"] +ray = ["ray[tune] (>=2.7.0)"] retrieval = ["datasets (!=2.5.0)", "faiss-cpu"] sagemaker = ["sagemaker (>=2.31.0)"] sentencepiece = ["protobuf", "sentencepiece (>=0.1.91,!=0.1.92)"] @@ -7107,18 +7155,18 @@ serving = ["fastapi", "pydantic (<2)", "starlette", "uvicorn"] sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "black (>=23.1,<24.0)", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "tensorboard", "timeout-decorator"] -tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx"] -tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.15)", "tensorflow-text (<2.15)", "tf2onnx"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (!=2.5.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "hf-doc-builder (>=0.3.0)", "nltk", "parameterized", "protobuf", "psutil", "pydantic (<2)", "pytest (>=7.2.0)", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.1.5)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "tensorboard", "timeout-decorator"] +tf = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] +tf-cpu = ["keras-nlp (>=0.3.1)", "onnxconverter-common", "tensorflow-cpu (>=2.6,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] timm = ["timm"] tokenizers = ["tokenizers (>=0.14,<0.19)"] -torch = 
["accelerate (>=0.20.3)", "torch (>=1.10,!=1.12.0)"] +torch = ["accelerate (>=0.21.0)", "torch (>=1.11,!=1.12.0)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -torch-vision = ["Pillow (<10.0.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.16.4,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.14,<0.19)", "torch (>=1.10,!=1.12.0)", "tqdm (>=4.27)"] +torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] +torchhub = ["filelock", "huggingface-hub (>=0.19.3,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.14,<0.19)", "torch (>=1.11,!=1.12.0)", "tqdm (>=4.27)"] video = ["av (==9.2.0)", "decord (==0.6.0)"] -vision = ["Pillow (<10.0.0)"] +vision = ["Pillow (>=10.0.1,<=15.0)"] [[package]] name = "triton" @@ -8104,6 +8152,7 @@ testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "p [extras] all = ["cohere", "detect-secrets", "email-reply-parser", "google-api-python-client", "google-auth-httplib2", "google-auth-oauthlib", "hdbscan", "langdetect", "langsmith", "llama-hub", "llama-index", "openai", "presidio_analyzer", "sentence-transformers", "spacy", "textacy", "umap-learn"] +bge = ["FlagEmbedding", "transformers"] cohere = ["cohere"] embeddings = ["cohere", "openai", "sentence-transformers"] github = ["llama-hub", "llama-index"] @@ -8122,4 +8171,4 @@ text-stats = ["textacy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "3bba3fdb0ea5d7e2efdbaa63bae5d52245f761a8e65b13d55d679e5c0c68a4b2" +content-hash = "94b8c942172e5a02cb89c0fe8f7ea134169bdf5356d5486549bd8facb73ba8aa" diff --git a/pyproject.toml b/pyproject.toml index 991c91e94..601712517 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,8 @@ jinja2 = "^3.1.3" # Used for directory li cohere = { version = "^4.32", optional = true } openai = { version = "^1.7.1", optional = true } sentence-transformers = { version = "^2.2.2", optional = true } # SBERT on-device embeddings. +FlagEmbedding = { version = "^1.2.3", optional = true } # bge on-device embeddings. +transformers = { version = "^4.37.2", optional = true } # bge on-device embeddings. # Gmail source. email-reply-parser = { version = "^0.5.12", optional = true } @@ -86,6 +88,7 @@ hdbscan = { version = "^0.8.33", optional = true } [tool.poetry.extras] all = [ + "bge", "cohere", "detect-secrets", "email-reply-parser", @@ -131,6 +134,7 @@ text_stats = ["textacy"] # Text statistics. # Individual embeddings. gte = ["sentence-transformers"] +bge = ["FlagEmbedding", "transformers"] sbert = ["sentence-transformers"] cohere = ["cohere"] openai = ["openai"] diff --git a/web/lib/fastapi_client/models/ConceptSignal.ts b/web/lib/fastapi_client/models/ConceptSignal.ts index 4196e1b17..e67095f92 100644 --- a/web/lib/fastapi_client/models/ConceptSignal.ts +++ b/web/lib/fastapi_client/models/ConceptSignal.ts @@ -11,7 +11,7 @@ export type ConceptSignal = { /** * The name of the pre-computed embedding. 
     */
-    embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base';
+    embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3';
     namespace: string;
     concept_name: string;
     version?: (number | null);

diff --git a/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts b/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts
index e159b80fc..7adb88434 100644
--- a/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts
+++ b/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts
@@ -14,7 +14,7 @@ export type SemanticSimilaritySignal = {
     /**
      * The name of the pre-computed embedding.
      */
-    embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base';
+    embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3';
     query: string;
     /**
      * The input type of the query, used for the query embedding.

From c19bafaef5edd151ee1a4d9f67f5b8c60e471d27 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Wed, 14 Feb 2024 11:47:28 -0500
Subject: [PATCH 2/6] Add nomic.

---
 lilac/embeddings/bge.py                       |  18 +--
 lilac/embeddings/nomic_embed.py               | 108 ++++++++++++++++++
 lilac/signals/default_signals.py              |   3 +
 poetry.lock                                   |  67 ++++-------
 pyproject.toml                                |   5 +-
 .../fastapi_client/models/ConceptSignal.ts    |   2 +-
 .../models/SemanticSimilaritySignal.ts        |   2 +-
 7 files changed, 147 insertions(+), 58 deletions(-)
 create mode 100644 lilac/embeddings/nomic_embed.py

diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py
index 0c578b1ee..5ab1560f5 100644
--- a/lilac/embeddings/bge.py
+++ b/lilac/embeddings/bge.py
@@ -1,9 +1,11 @@
 """BGE-M3 embedding model. Open-source model, designed to run on device."""
 import gc
-from typing import TYPE_CHECKING, ClassVar, Iterator, Optional
+from typing import TYPE_CHECKING, ClassVar, Optional

 from typing_extensions import override

+from ..utils import log
+
 if TYPE_CHECKING:
   from FlagEmbedding import BGEM3FlagModel

@@ -15,7 +17,7 @@ from ..splitters.spacy_splitter import clustering_spacy_chunker
 from ..tasks import TaskExecutionType
 from .embedding import chunked_compute_embedding
-from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
+from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE

 # See https://huggingface.co/spaces/mteb/leaderboard for a leaderboard of models.
 BGE_M3 = 'BAAI/bge-m3'

@@ -33,8 +35,11 @@ def _get_and_cache_bge_m3(model_name: str) -> 'BGEM3FlagModel':
   model = BGEM3FlagModel(
     'BAAI/bge-m3', use_fp16=True
   )  # Setting use_fp16 to True speeds up computation with a slight performance degradation
+
+  log(f'[{model_name}] Using device:', model.device)
+
+  # NOTE: we don't call setup model and device here as this happens internally.
   return model
-  return setup_model_device(model, model_name)


 class BGEM3(TextEmbeddingSignal):
@@ -62,7 +67,7 @@ def setup(self) -> None:
     self._model = _get_and_cache_bge_m3(self._model_name)

   @override
   def compute(self, docs: list[str]) -> list[Optional[Item]]:
     """Call the embedding function."""

-    def _encode(doc):
+    def _encode(doc: list[str]):
       # Extract the dense vectors from the model.
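       # BGEM3FlagModel.encode returns a dict of representations; 'dense_vecs'
       # holds the dense embedding matrix (sparse and ColBERT outputs are not
       # requested here).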
       return self._model.encode(doc)['dense_vecs']

@@ -73,16 +78,11 @@ def _encode(doc):
       _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
     )

-  @override
-  def compute_garden(self, docs: Iterator[str]) -> Iterator[Item]:
-    raise NotImplementedError('Garden computation is not supported for BGE-M3.')
-
   @override
   def teardown(self) -> None:
     if not hasattr(self, '_model'):
       return

-    self._model.cpu()
     del self._model
     gc.collect()

diff --git a/lilac/embeddings/nomic_embed.py b/lilac/embeddings/nomic_embed.py
new file mode 100644
index 000000000..77c525a78
--- /dev/null
+++ b/lilac/embeddings/nomic_embed.py
@@ -0,0 +1,108 @@
+"""Nomic Embed text embedding model. Open-source model, designed to run on device."""
+import gc
+from typing import TYPE_CHECKING, ClassVar, Optional
+
+from typing_extensions import override
+
+if TYPE_CHECKING:
+  from sentence_transformers import SentenceTransformer
+
+import functools
+
+from ..schema import Item
+from ..signal import TextEmbeddingSignal
+from ..splitters.spacy_splitter import clustering_spacy_chunker
+from ..tasks import TaskExecutionType
+from .embedding import chunked_compute_embedding
+from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device
+
+# See https://huggingface.co/spaces/mteb/leaderboard for a leaderboard of models.
+NOMIC_EMBED = 'nomic-ai/nomic-embed-text-v1.5'
+
+
+@functools.cache
+def _get_and_cache_model(model_name: str) -> 'SentenceTransformer':
+  try:
+    from sentence_transformers import SentenceTransformer
+  except ImportError:
+    raise ImportError(
+      'Could not import the "sentence_transformers" python package. '
+      'Please install it with `pip install "sentence_transformers"`.'
+    )
+  return setup_model_device(SentenceTransformer(model_name, trust_remote_code=True), model_name)
+
+
+class NomicEmbed15(TextEmbeddingSignal):
+  """Computes Nomic Embeddings 1.5 full (768 dimensions).
+
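+  nomic-embed-text-v1.5 is trained with Matryoshka representation learning, so
+  its embeddings can be truncated to a smaller dimension with modest quality
+  loss; NomicEmbed15_256 below exposes the 256-dimension variant.
+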
+  This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
+  for details.
+  """
+
+  name: ClassVar[str] = 'nomic-embed-1.5-768'
+  display_name: ClassVar[str] = 'Nomic Embeddings 1.5 768'
+  local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE
+  local_parallelism: ClassVar[int] = 1
+  local_strategy: ClassVar[TaskExecutionType] = 'threads'
+  supports_garden: ClassVar[bool] = False
+
+  _model_name = NOMIC_EMBED
+  _model: 'SentenceTransformer'
+  _matryoshka_dim = 768
+
+  @override
+  def setup(self) -> None:
+    self._model = _get_and_cache_model(self._model_name)
+
+  @override
+  def compute(self, docs: list[str]) -> list[Optional[Item]]:
+    """Call the embedding function."""
+    try:
+      import torch.nn.functional as F
+    except ImportError:
+      raise ImportError(
+        'Could not import the "sentence_transformers" python package. '
+        'Please install it with `pip install "sentence_transformers"`.'
+      )
+
+    def _encode(doc: list[str]):
+      embeddings = self._model.encode(doc, convert_to_tensor=True)
+      # Extract the dense vectors from the model.
+      embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
+      embeddings = embeddings[:, : self._matryoshka_dim]
+      return embeddings.cpu().numpy()
+
+    # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
+    # The sentence transformer API actually does batching internally, so we pass
+    # local_batch_size * 16 to allow the library to see all the chunks at once.
+    return chunked_compute_embedding(
+      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
+    )
+
+  @override
+  def teardown(self) -> None:
+    if not hasattr(self, '_model'):
+      return
+
+    self._model.cpu()
+    del self._model
+    gc.collect()
+
+    try:
+      import torch
+
+      torch.cuda.empty_cache()
+    except ImportError:
+      pass
+
+
+class NomicEmbed15_256(NomicEmbed15):
+  """Computes Nomic Embeddings 1.5 (256 dimensions).
+
This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5) + for details. + """ + + name: ClassVar[str] = 'nomic-embed-1.5-256' + display_name: ClassVar[str] = 'Nomic Embeddings 1.5 256' + _matryoshka_dim = 256 diff --git a/lilac/signals/default_signals.py b/lilac/signals/default_signals.py index 451788ecc..b85c71938 100644 --- a/lilac/signals/default_signals.py +++ b/lilac/signals/default_signals.py @@ -4,6 +4,7 @@ from ..embeddings.cohere import Cohere from ..embeddings.gte import GTEBase, GTESmall, GTETiny from ..embeddings.jina import JinaV2Base, JinaV2Small +from ..embeddings.nomic_embed import NomicEmbed15, NomicEmbed15_256 from ..embeddings.openai import OpenAIEmbedding from ..embeddings.sbert import SBERT from ..signal import register_signal @@ -46,3 +47,5 @@ def register_default_signals() -> None: register_signal(JinaV2Base, exists_ok=True) register_signal(BGEM3, exists_ok=True) + register_signal(NomicEmbed15, exists_ok=True) + register_signal(NomicEmbed15_256, exists_ok=True) diff --git a/poetry.lock b/poetry.lock index f5717225b..df5a20918 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1421,6 +1421,17 @@ files = [ {file = "duckdb-0.9.2.tar.gz", hash = "sha256:3843afeab7c3fc4a4c0b53686a4cc1d9cdbdadcbb468d60fef910355ecafd447"}, ] +[[package]] +name = "einops" +version = "0.7.0" +description = "A new flavour of deep learning operations" +optional = true +python-versions = ">=3.8" +files = [ + {file = "einops-0.7.0-py3-none-any.whl", hash = "sha256:0f3096f26b914f465f6ff3c66f5478f9a5e380bb367ffc6493a68143fbbf1fd1"}, + {file = "einops-0.7.0.tar.gz", hash = "sha256:b2b04ad6081a3b227080c9bf5e3ace7160357ff03043cd66cc5b2319eb7031d1"}, +] + [[package]] name = "email-reply-parser" version = "0.5.12" @@ -6056,25 +6067,26 @@ win32 = ["pywin32"] [[package]] name = "sentence-transformers" -version = "2.2.2" +version = "2.3.1" description = "Multilingual text embeddings" optional = true -python-versions = ">=3.6.0" +python-versions = ">=3.8.0" files = [ - {file = "sentence-transformers-2.2.2.tar.gz", hash = "sha256:dbc60163b27de21076c9a30d24b5b7b6fa05141d68cf2553fa9a77bf79a29136"}, + {file = "sentence-transformers-2.3.1.tar.gz", hash = "sha256:d589d85a464f45338cdbdf99ea715f8068e1fb01c582e0bcdbf60bcf3eade6d0"}, + {file = "sentence_transformers-2.3.1-py3-none-any.whl", hash = "sha256:285d6637726c3b002186aa4b8bcace1101364b32671fb605297c4c2636b8190e"}, ] [package.dependencies] -huggingface-hub = ">=0.4.0" +huggingface-hub = ">=0.15.1" nltk = "*" numpy = "*" +Pillow = "*" scikit-learn = "*" scipy = "*" sentencepiece = "*" -torch = ">=1.6.0" -torchvision = "*" +torch = ">=1.11.0" tqdm = "*" -transformers = ">=4.6.0,<5.0.0" +transformers = ">=4.32.0,<5.0.0" [[package]] name = "sentencepiece" @@ -7007,44 +7019,6 @@ typing-extensions = "*" dynamo = ["jinja2"] opt-einsum = ["opt-einsum (>=3.3)"] -[[package]] -name = "torchvision" -version = "0.16.2" -description = "image and video datasets and models for torch deep learning" -optional = true -python-versions = ">=3.8" -files = [ - {file = "torchvision-0.16.2-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:bc86f2800cb2c0c1a09c581409cdd6bff66e62f103dc83fc63f73346264c3756"}, - {file = "torchvision-0.16.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:b024bd412df6d3a007dcebf311a894eb3c5c21e1af80d12be382bbcb097a7c3a"}, - {file = "torchvision-0.16.2-cp310-cp310-manylinux1_x86_64.whl", hash = "sha256:e89f10f3c8351972b6e3fda95bc3e479ea8dbfc9dfcfd2c32902dbad4ba5cfc5"}, - {file = 
"torchvision-0.16.2-cp310-cp310-manylinux2014_aarch64.whl", hash = "sha256:96c7583700112a410bdc4e1e4f118c429dab49c29c9a31a2cc3579bc9b08b19d"}, - {file = "torchvision-0.16.2-cp310-cp310-win_amd64.whl", hash = "sha256:9f4032ebb3277fb07ff6a9b818d50a547fb8fcd89d958cfd9e773322454bb688"}, - {file = "torchvision-0.16.2-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:67b1aaf8b8cb02ce75dd445f291a27c8036a502f8c0aa76e28c37a0faac2e153"}, - {file = "torchvision-0.16.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:bef30d03e1d1c629761f4dca51d3b7d8a0dc0acce6f4068ab2a1634e8e7b64e0"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux1_x86_64.whl", hash = "sha256:e59cc7b2bd1ab5c0ce4ae382e4e37be8f1c174e8b5de2f6a23c170de9ae28495"}, - {file = "torchvision-0.16.2-cp311-cp311-manylinux2014_aarch64.whl", hash = "sha256:e130b08cc9b3cc73a6c59d6edf032394a322f9579bfd21d14bc2e1d0999aa758"}, - {file = "torchvision-0.16.2-cp311-cp311-win_amd64.whl", hash = "sha256:8692ab1e48807e9604046a6f4beeb67b523294cee1b00828654bb0df2cfce2b2"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_10_13_x86_64.whl", hash = "sha256:b82732dcf876a37c852772342aa6ee3480c03bb3e2a802ae109fc5f7e28d26e9"}, - {file = "torchvision-0.16.2-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4b065143d1a720fe8a9077fd4be35d491f98819ec80b3dbbc3ec64d0b707a906"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux1_x86_64.whl", hash = "sha256:bc5f274e4ecd1b86062063cdf4fd385a1d39d147a3a2685fbbde9ff08bb720b8"}, - {file = "torchvision-0.16.2-cp38-cp38-manylinux2014_aarch64.whl", hash = "sha256:335959c43b371c0474af34c1ef2a52efdc7603c45700d29e4475eeb02984170c"}, - {file = "torchvision-0.16.2-cp38-cp38-win_amd64.whl", hash = "sha256:7fd22d86e08eba321af70cad291020c2cdeac069b00ce88b923ca52e06174769"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_10_13_x86_64.whl", hash = "sha256:56115268b37f0b75364e3654e47ad9abc66ac34c1f9e5e3dfa89a22d6a40017a"}, - {file = "torchvision-0.16.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:82805f8445b094f9d1e770390ee6cc86855e89955e08ce34af2e2274fc0e5c45"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux1_x86_64.whl", hash = "sha256:3f4bd5fcbc361476e2e78016636ac7d5509e59d9962521f06eb98e6803898182"}, - {file = "torchvision-0.16.2-cp39-cp39-manylinux2014_aarch64.whl", hash = "sha256:8199acdf8ab066a28b84a5b6f4d97b58976d9e164b1acc3a9d14fccfaf74bb3a"}, - {file = "torchvision-0.16.2-cp39-cp39-win_amd64.whl", hash = "sha256:41dd4fa9f176d563fe9f1b9adef3b7e582cdfb60ce8c9bc51b094a025be687c9"}, -] - -[package.dependencies] -numpy = "*" -pillow = ">=5.3.0,<8.3.dev0 || >=8.4.dev0" -requests = "*" -torch = "2.1.2" - -[package.extras] -scipy = ["scipy"] - [[package]] name = "tornado" version = "6.4" @@ -8161,6 +8135,7 @@ gte = ["sentence-transformers"] lang-detection = ["langdetect"] langsmith = ["langsmith"] llms = ["openai"] +nomic = ["einops", "sentence-transformers"] openai = ["openai"] pii = ["detect-secrets", "presidio_analyzer"] sbert = ["sentence-transformers"] @@ -8171,4 +8146,4 @@ text-stats = ["textacy"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<4.0" -content-hash = "94b8c942172e5a02cb89c0fe8f7ea134169bdf5356d5486549bd8facb73ba8aa" +content-hash = "5aec3cf990d020b4c1c66a1ebce20b00778021db680a4c4dd8fdf65aa9fb4295" diff --git a/pyproject.toml b/pyproject.toml index 601712517..ec1927fda 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,9 +52,10 @@ jinja2 = "^3.1.3" # Used for directory li # LLM providers. 
cohere = { version = "^4.32", optional = true } openai = { version = "^1.7.1", optional = true } -sentence-transformers = { version = "^2.2.2", optional = true } # SBERT on-device embeddings. +sentence-transformers = { version = "^2.3.1", optional = true } # SBERT on-device embeddings. FlagEmbedding = { version = "^1.2.3", optional = true } # bge on-device embeddings. transformers = { version = "^4.37.2", optional = true } # bge on-device embeddings. +einops = { version = "^0.7.0", optional = true } # Nomic on-device embeddings. # Gmail source. email-reply-parser = { version = "^0.5.12", optional = true } @@ -100,6 +101,7 @@ all = [ "langsmith", "llama-hub", "llama-index", + "nomic", "openai", "presidio_analyzer", "sentence-transformers", @@ -135,6 +137,7 @@ text_stats = ["textacy"] # Text statistics. # Individual embeddings. gte = ["sentence-transformers"] bge = ["FlagEmbedding", "transformers"] +nomic = ["sentence-transformers", "einops"] sbert = ["sentence-transformers"] cohere = ["cohere"] openai = ["openai"] diff --git a/web/lib/fastapi_client/models/ConceptSignal.ts b/web/lib/fastapi_client/models/ConceptSignal.ts index e67095f92..19ea8906f 100644 --- a/web/lib/fastapi_client/models/ConceptSignal.ts +++ b/web/lib/fastapi_client/models/ConceptSignal.ts @@ -11,7 +11,7 @@ export type ConceptSignal = { /** * The name of the pre-computed embedding. */ - embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3'; + embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3' | 'nomic-embed-1.5-768' | 'nomic-embed-1.5-256'; namespace: string; concept_name: string; version?: (number | null); diff --git a/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts b/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts index 7adb88434..ec964d367 100644 --- a/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts +++ b/web/lib/fastapi_client/models/SemanticSimilaritySignal.ts @@ -14,7 +14,7 @@ export type SemanticSimilaritySignal = { /** * The name of the pre-computed embedding. */ - embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3'; + embedding: 'cohere' | 'sbert' | 'openai' | 'gte-tiny' | 'gte-small' | 'gte-base' | 'jina-v2-small' | 'jina-v2-base' | 'bge-m3' | 'nomic-embed-1.5-768' | 'nomic-embed-1.5-256'; query: string; /** * The input type of the query, used for the query embedding. 
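
With patches 1 and 2 applied, the new on-device signals are registered under
the names 'bge-m3', 'nomic-embed-1.5-768', and 'nomic-embed-1.5-256'. A
minimal sketch of how they might be exercised from the Python API (the
'local'/'movies' dataset and the 'text' path are hypothetical placeholders,
not part of this change):

  import lilac as ll

  ll.set_project_dir('./data')

  # Hypothetical dataset; substitute any local dataset and field path.
  dataset = ll.get_dataset('local', 'movies')

  # Chunk and index the field with the newly registered embeddings.
  dataset.compute_embedding('bge-m3', path='text')
  dataset.compute_embedding('nomic-embed-1.5-256', path='text')
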
From fe03f3f4e80fccb5907d24e33d7a7171801bb049 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Wed, 14 Feb 2024 11:48:28 -0500
Subject: [PATCH 3/6] save

---
 notebooks/Test.ipynb | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 notebooks/Test.ipynb

diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb
deleted file mode 100644
index e69de29bb..000000000

From 1a4c75a86e6e98b820bdea51522b59a9d6884890 Mon Sep 17 00:00:00 2001
From: Nikhil Thorat
Date: Wed, 14 Feb 2024 11:57:39 -0500
Subject: [PATCH 4/6] save

---
 lilac/data/cluster_titling.py   | 2 +-
 lilac/embeddings/bge.py         | 3 ++-
 lilac/embeddings/nomic_embed.py | 3 ++-
 mypy.ini                        | 5 +++++
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/lilac/data/cluster_titling.py b/lilac/data/cluster_titling.py
index 805735d56..7161c9c44 100644
--- a/lilac/data/cluster_titling.py
+++ b/lilac/data/cluster_titling.py
@@ -14,6 +14,7 @@ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential

 from ..batch_utils import group_by_sorted_key_iter
+from ..env import env
 from ..schema import (
   Item,
 )
@@ -24,7 +25,6 @@
 )
 from ..tasks import TaskInfo
 from ..utils import chunks, log
-from ..env import env

 _TOP_K_CENTRAL_DOCS = 7
 _TOP_K_CENTRAL_TITLES = 20

diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py
index 5ab1560f5..e8edf2f79 100644
--- a/lilac/embeddings/bge.py
+++ b/lilac/embeddings/bge.py
@@ -2,6 +2,7 @@
 import gc
 from typing import TYPE_CHECKING, ClassVar, Optional

+import numpy as np
 from typing_extensions import override

 from ..utils import log
@@ -67,7 +68,7 @@ def setup(self) -> None:
   def compute(self, docs: list[str]) -> list[Optional[Item]]:
     """Call the embedding function."""

-    def _encode(doc: list[str]):
+    def _encode(doc: list[str]) -> list[np.ndarray]:
       # Extract the dense vectors from the model.
       return self._model.encode(doc)['dense_vecs']

diff --git a/lilac/embeddings/nomic_embed.py b/lilac/embeddings/nomic_embed.py
index 77c525a78..24f703eed 100644
--- a/lilac/embeddings/nomic_embed.py
+++ b/lilac/embeddings/nomic_embed.py
@@ -2,6 +2,7 @@
 import gc
 from typing import TYPE_CHECKING, ClassVar, Optional

+import numpy as np
 from typing_extensions import override

 if TYPE_CHECKING:
@@ -65,7 +66,7 @@ def compute(self, docs: list[str]) -> list[Optional[Item]]:
         'Please install it with `pip install "sentence_transformers"`.'
       )

-    def _encode(doc: list[str]):
+    def _encode(doc: list[str]) -> list[np.ndarray]:
       embeddings = self._model.encode(doc, convert_to_tensor=True)
       # Extract the dense vectors from the model.
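       # Matryoshka-style truncation below: layer-normalize the full embedding,
       # then keep only the first _matryoshka_dim dimensions (768 here, 256 in
       # the NomicEmbed15_256 subclass).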
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],)) diff --git a/mypy.ini b/mypy.ini index e485c8758..840964cad 100644 --- a/mypy.ini +++ b/mypy.ini @@ -159,3 +159,8 @@ follow_imports = skip [mypy-hdbscan.*] ignore_missing_imports = True follow_imports = skip + + +[mypy-FlagEmbedding.*] +ignore_missing_imports = True +follow_imports = skip From 8f6c324eb93af4b5972de4d0763ed1d70ae2e4af Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Wed, 14 Feb 2024 11:59:16 -0500 Subject: [PATCH 5/6] save --- lilac/embeddings/bge.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/lilac/embeddings/bge.py b/lilac/embeddings/bge.py index e8edf2f79..0ba749a24 100644 --- a/lilac/embeddings/bge.py +++ b/lilac/embeddings/bge.py @@ -2,7 +2,6 @@ import gc from typing import TYPE_CHECKING, ClassVar, Optional -import numpy as np from typing_extensions import override from ..utils import log @@ -67,16 +66,14 @@ def setup(self) -> None: @override def compute(self, docs: list[str]) -> list[Optional[Item]]: """Call the embedding function.""" - - def _encode(doc: list[str]) -> list[np.ndarray]: - # Extract the dense vectors from the model. - return self._model.encode(doc)['dense_vecs'] - # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10. # The sentence transformer API actually does batching internally, so we pass # local_batch_size * 16 to allow the library to see all the chunks at once. return chunked_compute_embedding( - _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker + lambda docs: self._model.encode(docs)['dense_vecs'], + docs, + self.local_batch_size * 16, + chunker=clustering_spacy_chunker, ) @override From cbcfe8664f44d22b368e3bb8e937c0ddb4fbaf61 Mon Sep 17 00:00:00 2001 From: Nikhil Thorat Date: Wed, 14 Feb 2024 12:21:37 -0500 Subject: [PATCH 6/6] save --- pyproject.toml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index ec1927fda..a15ca1d43 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -89,10 +89,10 @@ hdbscan = { version = "^0.8.33", optional = true } [tool.poetry.extras] all = [ - "bge", "cohere", "detect-secrets", "email-reply-parser", + "FlagEmbedding", "google-api-python-client", "google-auth-httplib2", "google-auth-oauthlib", @@ -107,6 +107,7 @@ all = [ "sentence-transformers", "spacy", "textacy", + "transformers", "umap-learn", ]
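
Patch 6 swaps the invalid "bge" entry in the `all` extra for the actual
package names; Poetry extras must reference declared dependencies
("FlagEmbedding", "transformers"), not other extras. A quick smoke test of
the series' final state might look like this (hypothetical snippet, not part
of the series; assumes the bge and nomic extras are installed):

  # Assumes `pip install "lilac[bge]" "lilac[nomic]"`.
  from lilac.embeddings.bge import BGEM3
  from lilac.embeddings.nomic_embed import NomicEmbed15_256

  for signal in (BGEM3(), NomicEmbed15_256()):
    signal.setup()
    # compute() chunks each document with the spaCy chunker and returns one
    # item (a list of chunk spans with embeddings) per input document.
    [item] = signal.compute(['A short test document about embeddings.'])
    print(signal.name, len(item))
    signal.teardown()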