From e640f7c2a6eee2230fde5702a5c76bfc0ca63b70 Mon Sep 17 00:00:00 2001
From: mdr223
Date: Fri, 8 Sep 2023 12:39:00 -0400
Subject: [PATCH] trying to fix vectorstore being empty

---
 A2rchi/chains/chain.py       | 1 +
 A2rchi/utils/data_manager.py | 5 +++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/A2rchi/chains/chain.py b/A2rchi/chains/chain.py
index 0ed1233..a959fdc 100644
--- a/A2rchi/chains/chain.py
+++ b/A2rchi/chains/chain.py
@@ -66,6 +66,7 @@ def __init__(self):
 
 
     def update_vectorstore(self):
+        self.dataManager.update_vectorstore()
         self.vectorstore = self.dataManager.fetch_vectorstore()
         self.chain = BaseChain.from_llm(self.llm, self.vectorstore.as_retriever(), return_source_documents=True)
 
diff --git a/A2rchi/utils/data_manager.py b/A2rchi/utils/data_manager.py
index 788300f..f5f545a 100644
--- a/A2rchi/utils/data_manager.py
+++ b/A2rchi/utils/data_manager.py
@@ -55,7 +55,8 @@ def update_vectorstore(self):
         # remove obsolete files
         files_to_remove = list(set(files_in_vstore) - set(files_in_data))
         ids_to_remove = [id for id, file in zip(ids_in_vstore, files_in_vstore) if file in files_to_remove]
-        vstore._collection.delete(ids_to_remove)
+        if ids_to_remove:
+            vstore._collection.delete(ids_to_remove)
 
         # add new files to vectorstore; will do nothing if files_to_add is empty
         files_to_add = list(set(files_in_data) - set(files_in_vstore))
@@ -70,7 +71,7 @@ def loader(self, file_path):
         # return the document loader from a path, with the correct loader given the extension
         _, file_extension = os.path.splitext(file_path)
         if file_extension == ".txt" : return TextLoader(file_path)
-        elif file_extension == ".html" : return BSHTMLLoader(file_path)
+        elif file_extension == ".html" : return BSHTMLLoader(file_path, bs_kwargs={"features": "html.parser"})
         elif file_extension == ".pdf" : return PyPDFLoader(file_path)
         else:
             print(file_path, " Error: format not supported")