Skip to content

Commit

Permalink
Fix file paths and add missing variable reference
Browse files Browse the repository at this point in the history
  • Loading branch information
sdan committed Mar 31, 2024
1 parent 5dea5de commit e79b2cd
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 21 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/unit-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,5 +20,5 @@ jobs:
- name: Run tests
run: |
echo "Starting tests..."
python ./tests/unit.py
c
echo "Tests completed."
8 changes: 4 additions & 4 deletions tests/unit.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def test_add_texts(self):
text_512tokens = "underreckoning fleckiness hairstane paradigmatic eligibility sublevate xviii achylia reremice flung outpurl questing gilia unosmotic unsuckled plecopterid excludable phenazine fricando unfledgedness spiritsome incircle desmogenous subclavate redbug semihoral district chrysocolla protocoled servius readings propolises javali dujan stickman attendee hambone obtusipennate tightropes monitorially signaletics diestrums preassigning spriggy yestermorning margaritic tankfuls aseptify linearity hilasmic twinning tokonoma seminormalness cerebrospinant refroid doghouse kochab dacryocystalgia saltbushes newcomer provoker berberid platycoria overpersuaded reoverflow constrainable headless forgivably syzygal purled reese polyglottonic decennary embronze pluripotent equivocally myoblasts thymelaeaceous confervae perverted preanticipate mammalogical desalinizing tackets misappearance subflexuose concludence effluviums runtish gras cuckolded hemostasia coatroom chelidon policizer trichinised frontstall impositions unta outrance scholium fibrochondritis furcates fleaweed housefront helipads hemachate snift appellativeness knobwood superinclination tsures haberdasheries unparliamented reexecution nontangential waddied desolated subdistinctively undiscernibleness swishiest dextral progs koprino bruisingly unloanably bardash uncuckoldedunderreckoning fleckiness hairstane paradigmatic eligibility sublevate xviii achylia reremice flung outpurl questing gilia unosmotic unsuckled plecopterid excludable phenazine fricando unfledgedness spiritsome incircle desmogenous subclavate redbug semihoral district chrysocolla spriggy yestermorning margaritic tankfuls aseptify linearity hilasmic twinning tokonoma seminormalness cerebrospinant refroequivocally myoblasts thymelaeaceous confervae perverted preantiest dextral progs koprino bruisingly unloanably bardash uncuckolded"
metadata = {"source": "test_512tokens"}
self.vlite.add(text_512tokens, metadata=metadata)
- with open("data/text-8192tokens.txt", "r") as file:
+ with open(os.path.join(os.path.dirname(__file__), "data/text-8192tokens.txt"), "r") as file:
text_8192tokens = file.read()
metadata = {"source": "test_8192tokens"}
self.vlite.add(text_8192tokens, metadata=metadata)
Expand All @@ -45,7 +45,7 @@ def test_add_texts(self):

def test_add_pdf(self):
start_time = time.time()
- process_pdf('data/gpt-4.pdf')
+ process_pdf(os.path.join(os.path.dirname(__file__), 'data/gpt-4.pdf'))
end_time = time.time()
TestVLite.test_times["add_pdf"] = end_time - start_time
# time to add 71067 tokens from the GPT-4 paper
Expand All @@ -69,9 +69,9 @@ def test_retrieve(self):
"How does the GPT-4 handle tokenization?",
"What are the novel contributions of the GPT-4 model?"
]
- process_pdf('data/gpt-4.pdf')
+ process_pdf(os.path.join(os.path.dirname(__file__), 'data/gpt-4.pdf'))
start_time = time.time()
- for query in self.queries:
+ for query in queries:
_, top_sims, _ = self.vlite.retrieve(query)
print(f"Top similarities for query '{query}': {top_sims}")
end_time = time.time()
Expand Down
32 changes: 16 additions & 16 deletions vlite/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,46 +38,46 @@ def __init__(self, collection=None, device='cpu', model_name='mixedbread-ai/mxba
self.metadata = {} # Dictionary to store metadata
self.vectors = np.empty((0, self.model.dimension)) # Empty array to store embedding vectors


- def add(self, data, metadata=None):
+ def add(self, data, metadata=None, id=None):
"""
Adds text or a list of texts to the collection with optional ID and metadata.
Args:
- data (str, dict, or list): Text data to be added. Can be a string, a dictionary
- containing text, id, and/or metadata, or a list of strings or dictionaries.
+ data (str, dict, or list): Text data to be added. Can be a string, a dictionary containing text, id, and/or metadata, or a list of strings or dictionaries.
metadata (dict, optional): Additional metadata to be appended to each text entry.
+ id (str, optional): Unique identifier for the text entry. If not provided, a UUID will be generated.
Returns:
list: A list of tuples, each containing the ID of the added text and the updated vectors array.
"""
print("Adding text to the collection...")

data = [data] if not isinstance(data, list) else data

results = []

for item in data:
- text_content, id, item_metadata = (
- (item['text'], item.get('id', str(uuid4())), item.get('metadata', {}))
- if isinstance(item, dict)
- else (item, str(uuid4()), {})
- )
+ if isinstance(item, dict):
+ text_content = item['text']
+ item_id = item.get('id', str(uuid4()))
+ item_metadata = item.get('metadata', {})
+ else:
+ text_content = item
+ item_id = id or str(uuid4())
+ item_metadata = {}

item_metadata.update(metadata or {})

chunks = chop_and_chunk(text_content)
encoded_data = self.model.embed(chunks, device=self.device)
self.vectors = np.vstack((self.vectors, encoded_data))

update_metadata = lambda idx: {
**self.metadata.get(idx, {}),
**item_metadata,
- 'index': id
+ 'index': item_id
}
self.metadata.update({idx: update_metadata(idx) for idx in range(len(self.texts), len(self.texts) + len(chunks))})

self.metadata.update({idx: update_metadata(idx) for idx in range(len(self.texts), len(self.texts) + len(chunks))})
self.texts.extend(chunks)
- results.append((id, self.vectors))
+ results.append((item_id, self.vectors))

self.save()
print("Text added successfully.")
Expand Down Expand Up @@ -134,7 +134,7 @@ def update(self, id, text, metadata=None):
"""
print(f"Updating text with ID: {id}")
self.delete(id)
- self.add(text, id, metadata)
+ self.add(text, metadata=metadata, id=id)

def get(self, ids=None, where=None):
"""
Expand Down

0 comments on commit e79b2cd

Please sign in to comment.