Add bge-m3 and Nomic 1.5 embeddings. #1182

Merged · 6 commits · Feb 14, 2024
2 changes: 1 addition & 1 deletion lilac/data/cluster_titling.py
@@ -14,6 +14,7 @@
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential

 from ..batch_utils import group_by_sorted_key_iter
+from ..env import env
 from ..schema import (
   Item,
 )
@@ -24,7 +25,6 @@
 )
 from ..tasks import TaskInfo
 from ..utils import chunks, log
-from ..env import env

 _TOP_K_CENTRAL_DOCS = 7
 _TOP_K_CENTRAL_TITLES = 20
92 changes: 92 additions & 0 deletions lilac/embeddings/bge.py
@@ -0,0 +1,92 @@
"""Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
import gc
from typing import TYPE_CHECKING, ClassVar, Optional

from typing_extensions import override

from ..utils import log

if TYPE_CHECKING:
from FlagEmbedding import BGEM3FlagModel


import functools

from ..schema import Item
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
BGE_M3 = 'BAAI/bge-m3'


@functools.cache
def _get_and_cache_bge_m3(model_name: str) -> 'BGEM3FlagModel':
try:
from FlagEmbedding import BGEM3FlagModel
except ImportError:
raise ImportError(
'Could not import the "FlagEmbedding" python package. '
'Please install it with `pip install "lilac[bge]".'
)
model = BGEM3FlagModel(
'BAAI/bge-m3', use_fp16=True
) # Setting use_fp16 to True speeds up computation with a slight performance degradation

log(f'[{model_name}] Using device:', model.device)

# NOTE: we don't call setup model and device here as this happens internally.
return model


class BGEM3(TextEmbeddingSignal):
"""Computes BGE-M3 embeddings.

<br>This embedding runs on-device. See the [model card](https://huggingface.co/BAAI/bge-m3)
for details.
"""

name: ClassVar[str] = 'bge-m3'
display_name: ClassVar[str] = 'BGE-M3'
local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE
local_parallelism: ClassVar[int] = 1
local_strategy: ClassVar[TaskExecutionType] = 'threads'
supports_garden: ClassVar[bool] = False

_model_name = BGE_M3
_model: 'BGEM3FlagModel'

@override
def setup(self) -> None:
self._model = _get_and_cache_bge_m3(self._model_name)

@override
def compute(self, docs: list[str]) -> list[Optional[Item]]:
"""Call the embedding function."""
# While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
# The sentence transformer API actually does batching internally, so we pass
# local_batch_size * 16 to allow the library to see all the chunks at once.
return chunked_compute_embedding(
lambda docs: self._model.encode(docs)['dense_vecs'],
docs,
self.local_batch_size * 16,
chunker=clustering_spacy_chunker,
)

@override
def teardown(self) -> None:
if not hasattr(self, '_model'):
return

del self._model
gc.collect()

try:
import torch

torch.cuda.empty_cache()
except ImportError:
pass
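
For reference, a minimal standalone sketch of what this signal computes under the hood (assuming FlagEmbedding is installed; the example sentences are made up):

import numpy as np
from FlagEmbedding import BGEM3FlagModel

# use_fp16 trades a little accuracy for speed, matching the signal above.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

sentences = ['Lilac curates data for LLMs.', 'BGE-M3 is a multilingual embedding model.']
# encode() returns a dict; the signal above keeps only the dense vectors.
dense_vecs: np.ndarray = model.encode(sentences)['dense_vecs']
print(dense_vecs.shape)  # (2, 1024) -- bge-m3 produces 1024-dim dense embeddings
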
109 changes: 109 additions & 0 deletions lilac/embeddings/nomic_embed.py
@@ -0,0 +1,109 @@
"""Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
import gc
from typing import TYPE_CHECKING, ClassVar, Optional

import numpy as np
from typing_extensions import override

if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer

import functools

from ..schema import Item
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
NOMIC_EMBED = 'nomic-ai/nomic-embed-text-v1.5'


@functools.cache
def _get_and_cache_model(model_name: str) -> 'SentenceTransformer':
try:
from sentence_transformers import SentenceTransformer
except ImportError:
raise ImportError(
'Could not import the "sentence_transformers" python package. '
'Please install it with `pip install "sentence_transformers".'
)
return setup_model_device(SentenceTransformer(model_name, trust_remote_code=True), model_name)


class NomicEmbed15(TextEmbeddingSignal):
"""Computes Nomic Embeddings 1.5 full (768 dimensions).

<br>This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
for details.
"""

name: ClassVar[str] = 'nomic-embed-1.5-768'
display_name: ClassVar[str] = 'Nomic Embeddings 1.5 784'
local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE
local_parallelism: ClassVar[int] = 1
local_strategy: ClassVar[TaskExecutionType] = 'threads'
supports_garden: ClassVar[bool] = False

_model_name = NOMIC_EMBED
_model: 'SentenceTransformer'
_matryoshka_dim = 768

@override
def setup(self) -> None:
self._model = _get_and_cache_model(self._model_name)

@override
def compute(self, docs: list[str]) -> list[Optional[Item]]:
"""Call the embedding function."""
try:
import torch.nn.functional as F
except ImportError:
raise ImportError(
'Could not import the "sentence_transformers" python package. '
'Please install it with `pip install "sentence_transformers".'
)

def _encode(doc: list[str]) -> list[np.ndarray]:
embeddings = self._model.encode(doc, convert_to_tensor=True)
# Extract the dense vectors from the model.
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
embeddings = embeddings[:, : self._matryoshka_dim]

(Review thread on the truncation line above.)

Collaborator: Are the dimensions sorted by importance, like in PCA? Curious why we can just take the first 256/512 dimensions.

Contributor (Author): That's the magic of matryoshka_dim. It's not exactly PCA; as far as I can tell, it's baked into the loss function: https://aniketrege.github.io/blog/2024/mrl/

      return embeddings.cpu().numpy()

    # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
    # The sentence transformer API actually does batching internally, so we pass
    # local_batch_size * 16 to allow the library to see all the chunks at once.
    return chunked_compute_embedding(
      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
    )

  @override
  def teardown(self) -> None:
    if not hasattr(self, '_model'):
      return

    self._model.cpu()
    del self._model
    gc.collect()

    try:
      import torch

      torch.cuda.empty_cache()
    except ImportError:
      pass


class NomicEmbed15_256(NomicEmbed15):
  """Computes Nomic Embeddings 1.5 (256 dimensions).

  <br>This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
  for details.
  """

  name: ClassVar[str] = 'nomic-embed-1.5-256'
  display_name: ClassVar[str] = 'Nomic Embeddings 1.5 256'
  _matryoshka_dim = 256
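
To make the matryoshka truncation from the review thread concrete, here is a minimal sketch mirroring the _encode steps above (assuming sentence_transformers and torch are installed; the input string is made up):

import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
embeddings = model.encode(['An example document.'], convert_to_tensor=True)

# Matryoshka representation learning trains the model so that every prefix of
# the embedding is itself a usable embedding; truncation is part of the training
# objective, not a post-hoc projection like PCA.
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
full_768 = embeddings[:, :768]   # what NomicEmbed15 returns
small_256 = embeddings[:, :256]  # what NomicEmbed15_256 returns
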
6 changes: 6 additions & 0 deletions lilac/signals/default_signals.py
@@ -1,8 +1,10 @@
"""Registers all available default signals."""

from ..embeddings.bge import BGEM3
from ..embeddings.cohere import Cohere
from ..embeddings.gte import GTEBase, GTESmall, GTETiny
from ..embeddings.jina import JinaV2Base, JinaV2Small
from ..embeddings.nomic_embed import NomicEmbed15, NomicEmbed15_256
from ..embeddings.openai import OpenAIEmbedding
from ..embeddings.sbert import SBERT
from ..signal import register_signal
Expand Down Expand Up @@ -43,3 +45,7 @@ def register_default_signals() -> None:

register_signal(JinaV2Small, exists_ok=True)
register_signal(JinaV2Base, exists_ok=True)

register_signal(BGEM3, exists_ok=True)
register_signal(NomicEmbed15, exists_ok=True)
register_signal(NomicEmbed15_256, exists_ok=True)
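
Once registered, each embedding is addressable by its name ClassVar. A hypothetical usage sketch, assuming the usual lilac dataset API (get_dataset / compute_embedding) and a dataset with a 'text' field:

import lilac as ll

dataset = ll.get_dataset('local', 'my_dataset')  # hypothetical namespace/dataset
# 'bge-m3', 'nomic-embed-1.5-768', and 'nomic-embed-1.5-256' are the name
# ClassVars registered above.
dataset.compute_embedding('bge-m3', path='text')
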
5 changes: 5 additions & 0 deletions mypy.ini
@@ -159,3 +159,8 @@ follow_imports = skip
 [mypy-hdbscan.*]
 ignore_missing_imports = True
 follow_imports = skip
+
+
+[mypy-FlagEmbedding.*]
+ignore_missing_imports = True
+follow_imports = skip