Add bge-m3 and Nomic 1.5 embeddings. #1182

Merged · 6 commits · Feb 14, 2024
2 changes: 1 addition & 1 deletion lilac/data/cluster_titling.py
@@ -14,6 +14,7 @@
 from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential

 from ..batch_utils import group_by_sorted_key_iter
+from ..env import env
 from ..schema import (
   Item,
 )
@@ -24,7 +25,6 @@
 )
 from ..tasks import TaskInfo
 from ..utils import chunks, log
-from ..env import env

 _TOP_K_CENTRAL_DOCS = 7
 _TOP_K_CENTRAL_TITLES = 20
92 changes: 92 additions & 0 deletions lilac/embeddings/bge.py
@@ -0,0 +1,92 @@
"""Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
import gc
from typing import TYPE_CHECKING, ClassVar, Optional

from typing_extensions import override

from ..utils import log

if TYPE_CHECKING:
from FlagEmbedding import BGEM3FlagModel


import functools

from ..schema import Item
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
BGE_M3 = 'BAAI/bge-m3'


@functools.cache
def _get_and_cache_bge_m3(model_name: str) -> 'BGEM3FlagModel':
try:
from FlagEmbedding import BGEM3FlagModel
except ImportError:
raise ImportError(
'Could not import the "FlagEmbedding" python package. '
'Please install it with `pip install "lilac[bge]".'
)
model = BGEM3FlagModel(
'BAAI/bge-m3', use_fp16=True
) # Setting use_fp16 to True speeds up computation with a slight performance degradation

log(f'[{model_name}] Using device:', model.device)

# NOTE: we don't call setup model and device here as this happens internally.
return model


class BGEM3(TextEmbeddingSignal):
"""Computes BGE-M3 embeddings.

<br>This embedding runs on-device. See the [model card](https://huggingface.co/BAAI/bge-m3)
for details.
"""

name: ClassVar[str] = 'bge-m3'
display_name: ClassVar[str] = 'BGE-M3'
local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE
local_parallelism: ClassVar[int] = 1
local_strategy: ClassVar[TaskExecutionType] = 'threads'
supports_garden: ClassVar[bool] = False

_model_name = BGE_M3
_model: 'BGEM3FlagModel'

@override
def setup(self) -> None:
self._model = _get_and_cache_bge_m3(self._model_name)

@override
def compute(self, docs: list[str]) -> list[Optional[Item]]:
"""Call the embedding function."""
# While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
# The sentence transformer API actually does batching internally, so we pass
# local_batch_size * 16 to allow the library to see all the chunks at once.
return chunked_compute_embedding(
lambda docs: self._model.encode(docs)['dense_vecs'],
docs,
self.local_batch_size * 16,
chunker=clustering_spacy_chunker,
)

@override
def teardown(self) -> None:
if not hasattr(self, '_model'):
return

del self._model
gc.collect()

try:
import torch

torch.cuda.empty_cache()
except ImportError:
pass
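
For reference, a minimal standalone sketch of what this signal computes under the hood (assuming FlagEmbedding is installed; the example sentences are made up):

import numpy as np
from FlagEmbedding import BGEM3FlagModel

# use_fp16 trades a little accuracy for speed, matching the signal above.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16=True)

sentences = ['Lilac curates data for LLMs.', 'BGE-M3 is a multilingual embedding model.']
# encode() returns a dict; the signal above keeps only the dense vectors.
dense_vecs: np.ndarray = model.encode(sentences)['dense_vecs']
print(dense_vecs.shape)  # (2, 1024) -- bge-m3 produces 1024-dim dense embeddings
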
109 changes: 109 additions & 0 deletions lilac/embeddings/nomic_embed.py
@@ -0,0 +1,109 @@
"""Gegeral Text Embeddings (GTE) model. Open-source model, designed to run on device."""
import gc
from typing import TYPE_CHECKING, ClassVar, Optional

import numpy as np
from typing_extensions import override

if TYPE_CHECKING:
from sentence_transformers import SentenceTransformer

import functools

from ..schema import Item
from ..signal import TextEmbeddingSignal
from ..splitters.spacy_splitter import clustering_spacy_chunker
from ..tasks import TaskExecutionType
from .embedding import chunked_compute_embedding
from .transformer_utils import SENTENCE_TRANSFORMER_BATCH_SIZE, setup_model_device

# See https://huggingface.co/spaces/mteb/leaderboard for leaderboard of models.
NOMIC_EMBED = 'nomic-ai/nomic-embed-text-v1.5'


@functools.cache
def _get_and_cache_model(model_name: str) -> 'SentenceTransformer':
try:
from sentence_transformers import SentenceTransformer
except ImportError:
raise ImportError(
'Could not import the "sentence_transformers" python package. '
'Please install it with `pip install "sentence_transformers".'
)
return setup_model_device(SentenceTransformer(model_name, trust_remote_code=True), model_name)


class NomicEmbed15(TextEmbeddingSignal):
"""Computes Nomic Embeddings 1.5 full (768 dimensions).

<br>This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
for details.
"""

name: ClassVar[str] = 'nomic-embed-1.5-768'
display_name: ClassVar[str] = 'Nomic Embeddings 1.5 784'
local_batch_size: ClassVar[int] = SENTENCE_TRANSFORMER_BATCH_SIZE
local_parallelism: ClassVar[int] = 1
local_strategy: ClassVar[TaskExecutionType] = 'threads'
supports_garden: ClassVar[bool] = False

_model_name = NOMIC_EMBED
_model: 'SentenceTransformer'
_matryoshka_dim = 768

@override
def setup(self) -> None:
self._model = _get_and_cache_model(self._model_name)

@override
def compute(self, docs: list[str]) -> list[Optional[Item]]:
"""Call the embedding function."""
try:
import torch.nn.functional as F
except ImportError:
raise ImportError(
'Could not import the "sentence_transformers" python package. '
'Please install it with `pip install "sentence_transformers".'
)

def _encode(doc: list[str]) -> list[np.ndarray]:
embeddings = self._model.encode(doc, convert_to_tensor=True)
# Extract the dense vectors from the model.
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
embeddings = embeddings[:, : self._matryoshka_dim]

(Review thread on the truncation line above.)

Collaborator: Are the dimensions sorted by importance, like in PCA? Curious why we can just take the first 256/512 dimensions.

Contributor (Author): That's the magic of matryoshka_dim. It's not exactly PCA; as far as I can tell, it's baked into the loss function: https://aniketrege.github.io/blog/2024/mrl/

      return embeddings.cpu().numpy()

    # While we get docs in batches of 1024, the chunker expands that by a factor of 3-10.
    # The sentence transformer API actually does batching internally, so we pass
    # local_batch_size * 16 to allow the library to see all the chunks at once.
    return chunked_compute_embedding(
      _encode, docs, self.local_batch_size * 16, chunker=clustering_spacy_chunker
    )

  @override
  def teardown(self) -> None:
    if not hasattr(self, '_model'):
      return

    self._model.cpu()
    del self._model
    gc.collect()

    try:
      import torch

      torch.cuda.empty_cache()
    except ImportError:
      pass


class NomicEmbed15_256(NomicEmbed15):
  """Computes Nomic Embeddings 1.5 (256 dimensions).

  <br>This embedding runs on-device. See the [model card](https://huggingface.co/nomic-ai/nomic-embed-text-v1.5)
  for details.
  """

  name: ClassVar[str] = 'nomic-embed-1.5-256'
  display_name: ClassVar[str] = 'Nomic Embeddings 1.5 256'
  _matryoshka_dim = 256
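
To make the matryoshka truncation from the review thread concrete, here is a minimal sketch mirroring the _encode steps above (assuming sentence_transformers and torch are installed; the input string is made up):

import torch.nn.functional as F
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('nomic-ai/nomic-embed-text-v1.5', trust_remote_code=True)
embeddings = model.encode(['An example document.'], convert_to_tensor=True)

# Matryoshka representation learning trains the model so that every prefix of
# the embedding is itself a usable embedding; truncation is part of the training
# objective, not a post-hoc projection like PCA.
embeddings = F.layer_norm(embeddings, normalized_shape=(embeddings.shape[1],))
full_768 = embeddings[:, :768]   # what NomicEmbed15 returns
small_256 = embeddings[:, :256]  # what NomicEmbed15_256 returns
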
6 changes: 6 additions & 0 deletions lilac/signals/default_signals.py
@@ -1,8 +1,10 @@
"""Registers all available default signals."""

from ..embeddings.bge import BGEM3
from ..embeddings.cohere import Cohere
from ..embeddings.gte import GTEBase, GTESmall, GTETiny
from ..embeddings.jina import JinaV2Base, JinaV2Small
from ..embeddings.nomic_embed import NomicEmbed15, NomicEmbed15_256
from ..embeddings.openai import OpenAIEmbedding
from ..embeddings.sbert import SBERT
from ..signal import register_signal
Expand Down Expand Up @@ -43,3 +45,7 @@ def register_default_signals() -> None:

register_signal(JinaV2Small, exists_ok=True)
register_signal(JinaV2Base, exists_ok=True)

register_signal(BGEM3, exists_ok=True)
register_signal(NomicEmbed15, exists_ok=True)
register_signal(NomicEmbed15_256, exists_ok=True)
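
Once registered, each embedding is addressable by its name ClassVar. A hypothetical usage sketch, assuming the usual lilac dataset API (get_dataset / compute_embedding) and a dataset with a 'text' field:

import lilac as ll

dataset = ll.get_dataset('local', 'my_dataset')  # hypothetical namespace/dataset
# 'bge-m3', 'nomic-embed-1.5-768', and 'nomic-embed-1.5-256' are the name
# ClassVars registered above.
dataset.compute_embedding('bge-m3', path='text')
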
5 changes: 5 additions & 0 deletions mypy.ini
@@ -159,3 +159,8 @@ follow_imports = skip
 [mypy-hdbscan.*]
 ignore_missing_imports = True
 follow_imports = skip
+
+
+[mypy-FlagEmbedding.*]
+ignore_missing_imports = True
+follow_imports = skip