Skip to content

Commit

Permalink
Merge pull request #97 from mit-submit/main
Browse files Browse the repository at this point in the history
8.01 release, final before given to students
  • Loading branch information
julius-heitkoetter authored Oct 13, 2023
2 parents 8170f20 + e1b0669 commit 623c229
Showing 1 changed file with 6 additions and 2 deletions.
8 changes: 6 additions & 2 deletions A2rchi/utils/data_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import hashlib
import os
import yaml
import time


class DataManager():
Expand Down Expand Up @@ -215,13 +216,16 @@ def _add_to_vectorstore(self, collection, files_to_add, sources={}):
metadata["filename"] = filename

# create unique id for each chunk
# the first 12 bits of the id being the filename and the other 6 based on the chunk itself
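# the id is the filename hash, followed by 6 characters derived from the chunk itself, and a final 6 characters intended to hash the current time
# NOTE(review): time_hash below is read from `identifier` (the chunk digest), not `time_identifier`, so it currently repeats chunk_hash -- confirm and fix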
ids = []
for chunk in chunks:
identifier = hashlib.md5()
identifier.update(chunk.encode('utf-8'))
chunk_hash = str(int(identifier.hexdigest(),16))[0:6]
ids.append(str(filehash) + str(chunk_hash))
time_identifier = hashlib.md5()
time_identifier.update(str(time.time()).encode('utf-8'))
time_hash = str(int(identifier.hexdigest(),16))[0:6]
ids.append(str(filehash) + str(chunk_hash) + str(time_hash))

print("Ids: ",ids)
collection.add(embeddings=embeddings, ids=ids, documents=chunks, metadatas=metadatas)
Expand Down

0 comments on commit 623c229

Please sign in to comment.