Skip to content

Commit

Permalink
Add LanceDB Vector store (#115)
Browse files Browse the repository at this point in the history
  • Loading branch information
AyushExel authored Aug 14, 2023
1 parent 339169d commit c26912a
Show file tree
Hide file tree
Showing 9 changed files with 362 additions and 24 deletions.
10 changes: 5 additions & 5 deletions autochain/memory/long_term_memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@
from autochain.tools.internal_search.base_search_tool import BaseSearchTool
from autochain.tools.internal_search.chromadb_tool import ChromaDBSearch, ChromaDoc
from autochain.tools.internal_search.pinecone_tool import PineconeSearch, PineconeDoc
from autochain.tools.internal_search.lancedb_tool import LanceDBSeach, LanceDBDoc

SEARCH_PROVIDERS = (ChromaDBSearch, PineconeSearch, LanceDBSeach)
SEARCH_DOC_TYPES = (ChromaDoc, PineconeDoc, LanceDBDoc)

class LongTermMemory(BaseMemory):
"""Buffer for storing conversation memory and an in-memory kv store."""
Expand All @@ -22,10 +25,7 @@ class LongTermMemory(BaseMemory):
long_term_memory: BaseSearchTool = None

class Config:
keep_untouched = (
ChromaDBSearch,
PineconeSearch,
)
keep_untouched = SEARCH_PROVIDERS

def load_memory(
self,
Expand All @@ -50,7 +50,7 @@ def save_memory(self, key: str, value: Any) -> None:
if (
isinstance(value, list)
and len(value) > 0
and (isinstance(value[0], ChromaDoc) or isinstance(value[0], PineconeDoc))
and (isinstance(value[0], SEARCH_DOC_TYPES))
):
self.long_term_memory.add_docs(docs=value)
elif key:
Expand Down
91 changes: 91 additions & 0 deletions autochain/tools/internal_search/lancedb_tool.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
from dataclasses import dataclass
from typing import Any, List, Optional

import lancedb
import pandas as pd

from autochain.models.base import BaseLanguageModel
from autochain.tools.base import Tool
from autochain.tools.internal_search.base_search_tool import BaseSearchTool

@dataclass
class LanceDBDoc:
    """A single document to index in LanceDB.

    Attributes:
        doc: the raw document text.
        vector: the embedding for ``doc``; ``None`` until encoded by the
            search tool's encoder (see ``LanceDBSeach._encode_docs``).
    """

    doc: str
    # Optional because docs may arrive unencoded and be embedded lazily.
    vector: Optional[List[float]] = None

class LanceDBSeach(Tool, BaseSearchTool):
    """
    Use LanceDB as the internal search tool.

    LanceDB is a vector database that supports vector search.

    Args:
        uri: the uri of the database. Defaults to "lancedb".
        table_name: the name of the table. Defaults to "table".
        metric: the metric used for vector search. Defaults to "cosine".
        encoder: the encoder used to encode the documents. Defaults to None.
        docs: the documents to be indexed. Defaults to None.
    """

    class Config:
        """Configuration for this pydantic object."""

        arbitrary_types_allowed = True

    docs: List[LanceDBDoc]
    uri: str = "lancedb"
    table_name: str = "table"
    metric: str = "cosine"
    encoder: BaseLanguageModel = None
    db: lancedb.db.DBConnection = None
    table: lancedb.table.Table = None

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)
        self.db = lancedb.connect(self.uri)
        if self.docs:
            self._encode_docs(self.docs)
            self._create_table(self.docs)

    def _create_table(self, docs: List[LanceDBDoc]) -> None:
        """(Re)create the table from *docs*; overwrites any existing table."""
        self.table = self.db.create_table(
            self.table_name, self._docs_to_dataframe(docs), mode="overwrite"
        )

    def _encode_docs(self, docs: List[LanceDBDoc]) -> None:
        """Fill in missing embeddings in place.

        Raises:
            ValueError: if a doc lacks a vector and no encoder is configured.
        """
        for doc in docs:
            if not doc.vector:
                if not self.encoder:
                    raise ValueError("Encoder is not provided for encoding docs")
                doc.vector = self.encoder.encode([doc.doc]).embeddings[0]

    def _docs_to_dataframe(self, docs: List[LanceDBDoc]) -> pd.DataFrame:
        """Convert docs into the two-column ("doc", "vector") frame LanceDB stores."""
        return pd.DataFrame([{"doc": doc.doc, "vector": doc.vector} for doc in docs])

    def _run(
        self,
        query: str,
        top_k: int = 2,
        *args: Any,
        **kwargs: Any,
    ) -> str:
        """Return the *top_k* most similar docs as "Doc i: ..." lines, or "" if empty."""
        if self.table is None:
            return ""
        if self.encoder is None:
            # Previously an AttributeError; fail with an actionable message instead.
            raise ValueError("Encoder is not provided for encoding query")

        embeddings = self.encoder.encode([query]).embeddings[0]
        # Apply the configured distance metric; the field was previously
        # declared but never used, so searches fell back to LanceDB's default.
        result = (
            self.table.search(embeddings)
            .metric(self.metric)
            .limit(top_k)
            .to_df()["doc"]
            .to_list()
        )

        return "\n".join(f"Doc {i}: {doc}" for i, doc in enumerate(result))

    def add_docs(self, docs: List[LanceDBDoc], **kwargs) -> None:
        """Encode and append *docs*, creating the table on first use."""
        if not docs:
            return

        self._encode_docs(docs)
        # Compare against None explicitly (as _run does): a truthiness check on
        # an existing-but-empty table would wrongly recreate it with overwrite.
        if self.table is not None:
            self.table.add(self._docs_to_dataframe(docs))
        else:
            self._create_table(docs)

    def clear_index(self) -> None:
        """Drop the backing table (if present) and forget the handle."""
        if self.table_name in self.db.table_names():
            self.db.drop_table(self.table_name)
        self.table = None


# Backward-compatible alias: the docs refer to this tool as ``LanceDBSearch``;
# the original class name carries a typo that callers may already depend on.
LanceDBSearch = LanceDBSeach
2 changes: 1 addition & 1 deletion docs/memory.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ default type of memory AutoChain uses in examples and evaluation.

In the case there are a lot of information need to be stored and only a small part of it is
needed during the planning step, `LongTermMemory` enables agents to retrieve partial memory
with internal search tool, such as `ChromaDBSearch` and `PineconeSearch`. Search query is the
with an internal search tool, such as `ChromaDBSearch`, `PineconeSearch`, or `LanceDBSearch`. The search query is the
key of the store, and it still follows the same interface as other memory implementations. All of them
encode the text into a vector DB and retrieve it using the search query.

Expand Down
7 changes: 6 additions & 1 deletion docs/tool.md
Original file line number Diff line number Diff line change
Expand Up @@ -47,4 +47,9 @@ long term memory for the agent
### ChromaDBTool
Internal search tool that can be used for long term memory of the agent or looking up relevant
information that does not exist on the Internet. Currently, AutoChain supports `ChromaDB` as
long term memory for the agent
long term memory for the agent.

### LanceDBTool
Internal search tool that can be used for long term memory of the agent or looking up relevant
information that does not exist on the Internet. Currently, AutoChain supports `LanceDB` as
long term memory for the agent. LanceDBTool is serverless and does not require any setup.
Loading

0 comments on commit c26912a

Please sign in to comment.