
Commit

CR
hbertrand committed Nov 29, 2023
1 parent 21b6b34 commit 7c798db
Showing 2 changed files with 10 additions and 0 deletions.
4 changes: 4 additions & 0 deletions buster/documents_manager/base.py
@@ -75,6 +75,8 @@ def add(
num_workers (int, optional): The number of parallel workers to use for computing embeddings. Default is 32.
embedding_fn (callable, optional): A function that computes embeddings for a given input string.
Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model.
sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.
Default is None. Only use if you want sparse embeddings.
csv_filename: (str, optional) = Path to save a copy of the dataframe with computed embeddings for later use.
csv_overwrite: (bool, optional) = If csv_filename is specified, whether to overwrite the file with a new file.
@@ -121,6 +123,8 @@ def batch_add(
Defaults to 32.
embedding_fn (callable, optional): A function that computes embeddings for a given input string.
Default is 'get_embedding_openai' which uses the text-embedding-ada-002 model.
sparse_embedding_fn (callable, optional): A function that computes sparse embeddings for a given input string.
Default is None. Only use if you want sparse embeddings.
csv_filename: (str, optional) = Path to save a copy of the dataframe with computed embeddings for later use.
csv_overwrite: (bool, optional) = If csv_filename is specified, whether to overwrite the file with a new file.
When using batches, set to False to keep all embeddings in the same file. You may want to manually remove the file if experimenting.
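The `sparse_embedding_fn` parameter documented above can be sketched as follows. This is a hypothetical toy implementation, not part of the commit: it hashes tokens into a fixed index space and counts occurrences, producing the `{"indices": [...], "values": [...]}` shape Pinecone expects for sparse vectors. A real deployment would use something like BM25 or SPLADE instead.

```python
# Hypothetical toy sparse_embedding_fn: hashes tokens into a fixed
# vocabulary space and uses term counts as values. The output shape
# ({"indices": [...], "values": [...]}) matches Pinecone's sparse
# vector format; the function itself is illustrative only.
from collections import Counter

VOCAB_SIZE = 2**16  # assumed hash space for this toy example


def toy_sparse_embedding_fn(text: str) -> dict:
    # Count token occurrences, mapping each token to a hashed index.
    counts = Counter(hash(token) % VOCAB_SIZE for token in text.lower().split())
    indices = sorted(counts)
    return {"indices": indices, "values": [float(counts[i]) for i in indices]}
```

It would then be passed to `add`/`batch_add` via the new keyword, e.g. `manager.add(df, sparse_embedding_fn=toy_sparse_embedding_fn)` (usage assumed from the docstrings above).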
6 changes: 6 additions & 0 deletions buster/documents_manager/service.py
@@ -46,6 +46,8 @@ def _add_documents(self, df: pd.DataFrame):
"""Write all documents from the dataframe into the db as a new version."""

use_sparse_vector = "sparse_embedding" in df.columns
if use_sparse_vector:
logger.info("Uploading sparse embeddings too.")

for source in df.source.unique():
source_exists = self.db.sources.find_one({"name": source})
@@ -74,6 +76,10 @@ def _add_documents(self, df: pd.DataFrame):

to_upsert.append(vector)

# Current (November 2023) Pinecone upload rules:
# - Max 1000 vectors per batch
# - Max 2 MB per batch
# Sparse vectors are heavier, so we reduce the batch size when using them.
MAX_PINECONE_BATCH_SIZE = 100 if use_sparse_vector else 1000
for i in range(0, len(to_upsert), MAX_PINECONE_BATCH_SIZE):
self.index.upsert(vectors=to_upsert[i : i + MAX_PINECONE_BATCH_SIZE], namespace=self.namespace)
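The batching pattern in the upsert loop above (capping each batch at 1000 vectors, or 100 when sparse vectors are present, per the November 2023 Pinecone limits noted in the comments) can be isolated as a small helper. A minimal sketch; `batched` is a hypothetical name, not part of the commit:

```python
# Minimal sketch of the batching pattern used in _add_documents:
# slice a list of vectors into chunks no larger than the batch limit,
# so each chunk can be passed to a single index.upsert call.
def batched(items: list, batch_size: int) -> list:
    return [items[i : i + batch_size] for i in range(0, len(items), batch_size)]
```

With this helper, the loop would read roughly `for batch in batched(to_upsert, MAX_PINECONE_BATCH_SIZE): index.upsert(vectors=batch, namespace=...)`, equivalent to the range-based slicing in the diff.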
