From 55e19a05e499b1c69fa137765cb99350027d06c3 Mon Sep 17 00:00:00 2001 From: julius-heitkoetter <97237339+julius-heitkoetter@users.noreply.github.com> Date: Fri, 13 Oct 2023 09:16:42 -0400 Subject: [PATCH 1/2] have id's for chunks also include a hash for timestamp --- A2rchi/utils/data_manager.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/A2rchi/utils/data_manager.py b/A2rchi/utils/data_manager.py index 5173684..3328647 100644 --- a/A2rchi/utils/data_manager.py +++ b/A2rchi/utils/data_manager.py @@ -215,13 +215,16 @@ def _add_to_vectorstore(self, collection, files_to_add, sources={}): metadata["filename"] = filename # create unique id for each chunk - # the first 12 bits of the id being the filename and the other 6 based on the chunk itself + # the first 12 characters of the id come from the filename hash, the next 6 from the chunk itself, and the last 6 from a hash of the current time ids = [] for chunk in chunks: identifier = hashlib.md5() identifier.update(chunk.encode('utf-8')) chunk_hash = str(int(identifier.hexdigest(),16))[0:6] - ids.append(str(filehash) + str(chunk_hash)) + time_identifier = hashlib.md5() + time_identifier.update(str(time.time()).encode('utf-8')) + time_hash = str(int(time_identifier.hexdigest(),16))[0:6] + ids.append(str(filehash) + str(chunk_hash) + str(time_hash)) print("Ids: ",ids) collection.add(embeddings=embeddings, ids=ids, documents=chunks, metadatas=metadatas) From 01b14cc3c73fe41b7bf453eb79e97483c236cbb6 Mon Sep 17 00:00:00 2001 From: julius-heitkoetter <97237339+julius-heitkoetter@users.noreply.github.com> Date: Fri, 13 Oct 2023 09:17:53 -0400 Subject: [PATCH 2/2] Adding time import --- A2rchi/utils/data_manager.py | 1 + 1 file changed, 1 insertion(+) diff --git a/A2rchi/utils/data_manager.py b/A2rchi/utils/data_manager.py index 3328647..4245f20 100644 --- a/A2rchi/utils/data_manager.py +++ b/A2rchi/utils/data_manager.py @@ -11,6 +11,7 @@ import hashlib import os import yaml +import time class DataManager():