Add LanceDB Vector store (#115)

Forethought-Technologies · Aug 14, 2023 · c26912a · c26912a
1 parent 339169d
commit c26912a
Show file tree

Hide file tree

Showing 9 changed files with 362 additions and 24 deletions.
diff --git a/autochain/memory/long_term_memory.py b/autochain/memory/long_term_memory.py
@@ -12,7 +12,10 @@
 from autochain.tools.internal_search.base_search_tool import BaseSearchTool
 from autochain.tools.internal_search.chromadb_tool import ChromaDBSearch, ChromaDoc
 from autochain.tools.internal_search.pinecone_tool import PineconeSearch, PineconeDoc
+from autochain.tools.internal_search.lancedb_tool import LanceDBSeach, LanceDBDoc
 
+# Search tool classes listed in `LongTermMemory.Config.keep_untouched`.
+SEARCH_PROVIDERS = (ChromaDBSearch, PineconeSearch, LanceDBSeach)
+# Document types recognised by `save_memory`: a non-empty list whose first
+# item is an instance of one of these is passed to `add_docs`.
+SEARCH_DOC_TYPES = (ChromaDoc, PineconeDoc, LanceDBDoc)
 
 class LongTermMemory(BaseMemory):
     """Buffer for storing conversation memory and an in-memory kv store."""
@@ -22,10 +25,7 @@ class LongTermMemory(BaseMemory):
     long_term_memory: BaseSearchTool = None
 
     class Config:
-        keep_untouched = (
-            ChromaDBSearch,
-            PineconeSearch,
-        )
+        keep_untouched = SEARCH_PROVIDERS
 
     def load_memory(
         self,
@@ -50,7 +50,7 @@ def save_memory(self, key: str, value: Any) -> None:
         if (
             isinstance(value, list)
             and len(value) > 0
-            and (isinstance(value[0], ChromaDoc) or isinstance(value[0], PineconeDoc))
+            and (isinstance(value[0], SEARCH_DOC_TYPES))
         ):
             self.long_term_memory.add_docs(docs=value)
         elif key:

diff --git a/autochain/tools/internal_search/lancedb_tool.py b/autochain/tools/internal_search/lancedb_tool.py
@@ -0,0 +1,91 @@
+from typing import List, Any
+from dataclasses import dataclass
+
+import lancedb
+import pandas as pd
+
+from autochain.tools.base import Tool
+from autochain.models.base import BaseLanguageModel
+from autochain.tools.internal_search.base_search_tool import BaseSearchTool
+
+@dataclass
+class LanceDBDoc:
+    """A document stored in LanceDB: its text plus an optional embedding."""
+
+    # Raw text of the document; this is what gets encoded and returned by search.
+    doc: str
+    # Embedding of `doc`. When left empty, `LanceDBSeach._encode_docs` fills it
+    # in with the configured encoder before the document is written.
+    vector: List[float] = None
+
+class LanceDBSeach(Tool, BaseSearchTool):
+    """
+    Use LanceDB as the internal search tool
+
+    LanceDB is a vector database that supports vector search. Documents are
+    encoded with ``encoder`` (unless they already carry a vector) and stored
+    in a single table; queries are encoded the same way and matched against
+    that table.
+
+    Args:
+        uri: the uri of the database. Default to "lancedb"
+        table_name: the name of the table. Default to "table"
+        metric: the metric used for vector search. Default to "cosine"
+        encoder: the encoder used to encode the documents. Default to None
+        docs: the documents to be indexed. Default to None
+    """
+
+    class Config:
+        """Configuration for this pydantic object."""
+
+        arbitrary_types_allowed = True
+
+    # Defaults to None (as documented above) so the tool can be created empty
+    # and filled later through `add_docs`.
+    docs: List[LanceDBDoc] = None
+    uri: str = "lancedb"
+    table_name: str = "table"
+    metric: str = "cosine"
+    encoder: BaseLanguageModel = None
+    db: lancedb.db.DBConnection = None
+    table: lancedb.table.Table = None
+
+    def __init__(self, **kwargs) -> None:
+        """Connect to the database at ``uri`` and index any initial ``docs``."""
+        super().__init__(**kwargs)
+        self.db = lancedb.connect(self.uri)
+        if self.docs:
+            self._encode_docs(self.docs)
+            self._create_table(self.docs)
+
+    def _create_table(self, docs: List[LanceDBDoc]) -> None:
+        """Create ``table_name`` from ``docs``, overwriting an existing table."""
+        self.table = self.db.create_table(
+            self.table_name,
+            self._docs_to_dataframe(docs),
+            mode="overwrite",
+        )
+
+    def _encode_docs(self, docs: List[LanceDBDoc]) -> None:
+        """Fill in ``vector`` in place for every doc that does not have one.
+
+        Raises:
+            ValueError: if a doc needs encoding and no encoder is configured.
+        """
+        for doc in docs:
+            # Explicit None/empty test: truth-testing an array-like vector is
+            # ambiguous, while a list and an array both support len().
+            if doc.vector is not None and len(doc.vector) > 0:
+                continue
+            if self.encoder is None:
+                raise ValueError("Encoder is not provided for encoding docs")
+            doc.vector = self.encoder.encode([doc.doc]).embeddings[0]
+
+    def _docs_to_dataframe(self, docs: List[LanceDBDoc]) -> pd.DataFrame:
+        """Return a DataFrame with one ``doc``/``vector`` row per document."""
+        return pd.DataFrame(
+            [{"doc": doc.doc, "vector": doc.vector} for doc in docs]
+        )
+
+    def _run(
+        self,
+        query: str,
+        top_k: int = 2,
+        *args: Any,
+        **kwargs: Any,
+    ) -> str:
+        """Return the ``top_k`` closest documents, one ``Doc i: ...`` per line.
+
+        Returns an empty string when nothing has been indexed yet.
+
+        Raises:
+            ValueError: if no encoder is configured to encode the query.
+        """
+        if self.table is None:
+            return ""
+        if self.encoder is None:
+            raise ValueError("Encoder is not provided for encoding the query")
+
+        embeddings = self.encoder.encode([query]).embeddings[0]
+        result = (
+            self.table.search(embeddings)
+            # Apply the configured metric; without this call the query runs
+            # with LanceDB's own default and `metric` has no effect.
+            .metric(self.metric)
+            .limit(top_k)
+            .to_df()["doc"]
+            .to_list()
+        )
+
+        return "\n".join(f"Doc {i}: {doc}" for i, doc in enumerate(result))
+
+    def add_docs(self, docs: List[LanceDBDoc], **kwargs):
+        """Encode ``docs`` and append them, creating the table on first use.
+
+        A ``None`` or empty ``docs`` is a no-op.
+        """
+        if not docs:
+            return
+
+        self._encode_docs(docs)
+        # Compare against None, matching `_run`, rather than truth-testing the
+        # table object.
+        if self.table is None:
+            self._create_table(docs)
+        else:
+            self.table.add(self._docs_to_dataframe(docs))
+
+    def clear_index(self):
+        """Drop the table if it exists and forget the table handle."""
+        if self.table_name in self.db.table_names():
+            self.db.drop_table(self.table_name)
+        self.table = None
+
+
+# Correctly spelled alias; the original class name stays for existing imports.
+LanceDBSearch = LanceDBSeach
diff --git a/docs/memory.md b/docs/memory.md
@@ -41,7 +41,7 @@ default type of memory AutoChain uses in examples and evaluation.
 
 In the case there are a lot of information need to be stored and only a small part of it is
 needed during the planning step, `LongTermMemory` enables agents to retrieve partial memory
-with internal search tool, such as `ChromaDBSearch` and `PineconeSearch`. Search query is the 
+with an internal search tool, such as `ChromaDBSearch`, `PineconeSearch`, or `LanceDBSearch`. The search query is the
 key of the store, and it still follow the same interface as other memory implementations. Both 
 would encode the text into vector DB and retrieve using the search query.
 

diff --git a/docs/tool.md b/docs/tool.md
@@ -47,4 +47,9 @@ long term memory for the agent
 ### ChromaDBTool
 Internal search tool that can be used for long term memory of the agent or looking up relevant
 information that does not exists from the Internet. Currently, AutoChain supports `ChromaDB` as
-long term memory for the agent
+long term memory for the agent.
+
+### LanceDBTool
+Internal search tool that can be used for long term memory of the agent or looking up relevant
+information that does not exist on the Internet. AutoChain also supports `LanceDB` as
+long term memory for the agent. LanceDBTool is serverless, and does not require any setup.