diff --git a/tests/omnoms/vlite-unit.omom b/tests/omnoms/vlite-unit.omom new file mode 100644 index 0000000..e86a98c Binary files /dev/null and b/tests/omnoms/vlite-unit.omom differ diff --git a/tests/unit.py b/tests/unit.py index 9a28175..d2ac261 100644 --- a/tests/unit.py +++ b/tests/unit.py @@ -105,6 +105,9 @@ def tearDownClass(cls): if os.path.exists('vlite-unit.npz'): print("[+] Removing vlite") os.remove('vlite-unit.npz') + if os.path.exists('vlite-unit.omom'): + print("[+] Removing vlite") + os.remove('vlite-unit.omom') if __name__ == '__main__': unittest.main(verbosity=2) \ No newline at end of file diff --git a/vlite/main.py b/vlite/main.py index ece5788..2a898d3 100644 --- a/vlite/main.py +++ b/vlite/main.py @@ -3,60 +3,35 @@ from .model import EmbeddingModel from .utils import chop_and_chunk import datetime +from .omom import Omom class VLite: - """ - A simple vector database for text embedding and retrieval. - - Attributes: - collection (str): Path to the collection file. - device (str): Device to use for embedding ('cpu' or 'cuda'). - model (EmbeddingModel): The embedding model used for text representation. - - Methods: - add(text, id=None, metadata=None): Adds a text to the collection with optional ID and metadata. - retrieve(text=None, id=None, top_k=5): Retrieves similar texts from the collection. - save(): Saves the collection to a file. - """ def __init__(self, collection=None, device='cpu', model_name='mixedbread-ai/mxbai-embed-large-v1'): if collection is None: current_datetime = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") collection = f"vlite_{current_datetime}" - self.collection = f"{collection}.npz" + self.collection = f"{collection}" self.device = device self.model = EmbeddingModel(model_name) if model_name else EmbeddingModel() + + self.omom = Omom() + self.index = {} try: - with np.load(self.collection, allow_pickle=True) as data: - index_data = data['index'].item() + with self.omom.read(collection) as omom_file: self.index = { chunk_id: { 'text': chunk_data['text'], 'metadata': chunk_data['metadata'], - 'vector': np.array(chunk_data['vector']), # Convert back to numpy array - 'binary_vector': np.array(chunk_data['binary_vector']), # Convert back to numpy array - 'int8_vector': np.array(chunk_data['int8_vector']) # Convert back to numpy array + 'binary_vector': np.array(chunk_data['binary_vector']) } - for chunk_id, chunk_data in index_data.items() + for chunk_id, chunk_data in omom_file.metadata.items() } except FileNotFoundError: print(f"Collection file {self.collection} not found. Initializing empty attributes.") - self.index = {} - - def add(self, data, metadata=None, need_chunks=True, newEmbedding=False, fast=True): - """ - Adds text or a list of texts to the collection with optional ID within metadata. - - Args: - data (str, dict, or list): Text data to be added. Can be a string, a dictionary containing text, id, and/or metadata, or a list of strings or dictionaries. - metadata (dict, optional): Additional metadata to be appended to each text entry. - need_chunks (bool, optional): Whether to split the text into chunks before embedding. Defaults to True. - fast (bool, optional): Whether to use fast mode for chunking. Defaults to True. - - Returns: - list: A list of tuples, each containing the ID of the added text and the updated vectors array. - """ - print("Adding text to the collection...") + + def add(self, data, metadata=None, need_chunks=True, fast=True): + print("Adding text to the collection...", self.collection) data = [data] if not isinstance(data, list) else data results = [] all_chunks = [] @@ -88,48 +63,33 @@ def add(self, data, metadata=None, need_chunks=True, newEmbedding=False, fast=Tr encoded_data = self.model.embed(all_chunks, device=self.device) binary_encoded_data = self.model.quantize(encoded_data, precision="binary") - int8_encoded_data = self.model.quantize(encoded_data, precision="int8") - - for idx, (chunk, vector, binary_vector, int8_vector, metadata, item_id) in enumerate(zip(all_chunks, encoded_data, binary_encoded_data, int8_encoded_data, all_metadata, all_ids)): + + for idx, (chunk, binary_vector, metadata, item_id) in enumerate(zip(all_chunks, binary_encoded_data, all_metadata, all_ids)): chunk_id = f"{item_id}_{idx}" self.index[chunk_id] = { 'text': chunk, 'metadata': metadata, - 'vector': vector, - 'binary_vector': binary_vector.tolist(), - 'int8_vector': int8_vector.tolist() + 'binary_vector': binary_vector.tolist() } if item_id not in [result[0] for result in results]: - results.append((item_id, encoded_data, metadata)) + results.append((item_id, binary_encoded_data, metadata)) self.save() print("Text added successfully.") return results - def retrieve(self, text=None, top_k=5, metadata=None, newEmbedding=False): - """ - Retrieves similar texts from the collection based on text content, ID, or metadata. - - Args: - text (str, optional): Query text for finding similar texts. - top_k (int, optional): Number of top similar texts to retrieve. Defaults to 5. - metadata (dict, optional): Metadata to filter the retrieved texts. - - Returns: - tuple: A tuple containing a list of similar texts, their similarity scores, and metadata (if applicable). - """ + def retrieve(self, text=None, top_k=5, metadata=None): print("Retrieving similar texts...") if text: print(f"Retrieving top {top_k} similar texts for query: {text}") query_chunks = chop_and_chunk(text, fast=True) query_vectors = self.model.embed(query_chunks, device=self.device) query_binary_vectors = self.model.quantize(query_vectors, precision="binary") - query_int8_vectors = self.model.quantize(query_vectors, precision="int8") results = [] - for query_binary_vector, query_int8_vector in zip(query_binary_vectors, query_int8_vectors): - chunk_results = self.rescore(query_binary_vector, query_int8_vector, top_k, metadata) + for query_binary_vector in query_binary_vectors: + chunk_results = self.search(query_binary_vector, top_k, metadata) results.extend(chunk_results) results.sort(key=lambda x: x[1], reverse=True) @@ -138,59 +98,31 @@ def retrieve(self, text=None, top_k=5, metadata=None, newEmbedding=False): print("Retrieval completed.") return [(self.index[idx]['text'], score, self.index[idx]['metadata']) for idx, score in results] - def rescore(self, query_binary_vector, query_int8_vector, top_k, metadata=None): - """ - Performs retrieval using binary search and rescoring using int8 embeddings. - - Args: - query_binary_vector (numpy.ndarray): Binary vector of the query. - query_int8_vector (numpy.ndarray): Int8 vector of the query. - top_k (int): Number of top similar texts to retrieve. - metadata (dict, optional): Metadata to filter the retrieved texts. - - Returns: - list: A list of tuples containing the chunk IDs and their similarity scores. - """ - # Reshape query_binary_vector and query_int8_vector to 1D arrays + def search(self, query_binary_vector, top_k, metadata=None): + # Reshape query_binary_vector to 1D array query_binary_vector = query_binary_vector.reshape(-1) - query_int8_vector = query_int8_vector.reshape(-1) # Perform binary search binary_vectors = np.array([item['binary_vector'] for item in self.index.values()]) binary_similarities = np.einsum('i,ji->j', query_binary_vector, binary_vectors) - top_k_indices = np.argpartition(binary_similarities, -top_k*4)[-top_k*4:] + top_k_indices = np.argpartition(binary_similarities, -top_k)[-top_k:] top_k_ids = [list(self.index.keys())[idx] for idx in top_k_indices] - # Apply metadata filter on the retrieved top_k*4 items + # Apply metadata filter on the retrieved top_k items if metadata: filtered_ids = [] for item_id in top_k_ids: item_metadata = self.index[item_id]['metadata'] if all(item_metadata.get(key) == value for key, value in metadata.items()): filtered_ids.append(item_id) - top_k_ids = filtered_ids[:top_k*4] - - # Perform rescoring using int8 embeddings - int8_vectors = np.array([self.index[idx]['int8_vector'] for idx in top_k_ids]) - int8_similarities = np.einsum('i,ji->j', query_int8_vector, int8_vectors) + top_k_ids = filtered_ids[:top_k] - # Sort the results based on the int8 similarities - sorted_indices = np.argpartition(int8_similarities, -top_k)[-top_k:] - sorted_ids = np.take(top_k_ids, sorted_indices) - sorted_scores = int8_similarities[sorted_indices] + # Get the similarity scores for the top_k items + top_k_scores = binary_similarities[top_k_indices] - return list(zip(sorted_ids, sorted_scores)) + return list(zip(top_k_ids, top_k_scores)) def delete(self, ids): - """ - Deletes items from the collection by their IDs. - - Args: - ids (list or str): A single ID or a list of IDs of the items to delete. - - Returns: - int: The number of items deleted from the collection. - """ if isinstance(ids, str): ids = [ids] @@ -209,18 +141,6 @@ def delete(self, ids): return deleted_count def update(self, id, text=None, metadata=None, vector=None): - """ - Updates an item in the collection by its ID. - - Args: - id (str): The ID of the item to update. - text (str, optional): The updated text content of the item. - metadata (dict, optional): The updated metadata of the item. - vector (numpy.ndarray, optional): The updated embedding vector of the item. - - Returns: - bool: True if the item was successfully updated, False otherwise. - """ if id in self.index: if text is not None: self.index[id]['text'] = text @@ -239,40 +159,18 @@ def update(self, id, text=None, metadata=None, vector=None): return False def get(self, ids=None, where=None): - """ - Retrieves items from the collection based on IDs and/or metadata. - - Args: - ids (list, optional): List of IDs to retrieve. If provided, only items with the specified IDs will be returned. - where (dict, optional): Metadata filter to apply. Items matching the filter will be returned. - - Returns: - list: A list of retrieved items, each item being a tuple of (text, metadata). - """ if ids is not None: - # Convert ids to a set for faster membership testing id_set = set(ids) items = [(self.index[id]['text'], self.index[id]['metadata']) for id in self.index if id in id_set] else: items = [(self.index[id]['text'], self.index[id]['metadata']) for id in self.index] if where is not None: - # Filter items based on metadata items = [item for item in items if all(item[1].get(key) == value for key, value in where.items())] return items - def set(self, id, text=None, metadata=None, vector=None): - """ - Updates the attributes of an item in the collection by ID. - - Args: - id (str): ID of the item to update. - text (str, optional): Updated text content of the item. - metadata (dict, optional): Updated metadata of the item. - vector (numpy.ndarray, optional): Updated embedding vector of the item. - """ print(f"Setting attributes for item with ID: {id}") if id in self.index: if text is not None: @@ -286,48 +184,31 @@ def set(self, id, text=None, metadata=None, vector=None): print(f"Item with ID {id} not found.") def count(self): - """ - Returns the number of items in the collection. - - Returns: - int: The count of items in the collection. - """ return len(self.index) + def save(self): - """ - Saves the current state of the collection to a file. - """ print(f"Saving collection to {self.collection}") - index_data = { - chunk_id: { - 'text': chunk_data['text'], - 'metadata': chunk_data['metadata'], - 'vector': chunk_data['vector'], - 'binary_vector': chunk_data['binary_vector'], - 'int8_vector': chunk_data['int8_vector'] - } - for chunk_id, chunk_data in self.index.items() - } - with open(self.collection, 'wb') as f: - np.savez(f, index=index_data) + with self.omom.create(self.collection) as omom_file: + omom_file.set_header( + embedding_model=self.model.model_metadata['general.name'], + embedding_size=self.model.model_metadata.get('bert.embedding_length', 1024), + embedding_dtype=self.model.embedding_dtype, + context_length=self.model.model_metadata.get('bert.context_length', 512) + ) + for chunk_id, chunk_data in self.index.items(): + omom_file.add_embedding(chunk_data['binary_vector']) + omom_file.add_context(chunk_data['text']) + omom_file.add_metadata(chunk_id, chunk_data['metadata']) print("Collection saved successfully.") - def clear(self): - """ - Clears the entire collection, removing all items and resetting the attributes. - """ print("Clearing the collection...") self.index = {} - self.save() + self.omom.delete(self.collection) print("Collection cleared.") def info(self): - """ - Prints information about the collection, including the number of items, collection file path, - and the embedding model used. - """ print("Collection Information:") print(f" Items: {self.count()}") print(f" Collection file: {self.collection}") @@ -337,10 +218,4 @@ def __repr__(self): return f"VLite(collection={self.collection}, device={self.device}, model={self.model})" def dump(self): - """ - Dumps the collection data to a dictionary for serialization. - - Returns: - dict: A dictionary containing the collection data. - """ return self.index \ No newline at end of file diff --git a/vlite/model.py b/vlite/model.py index 2ab11e1..2de451d 100644 --- a/vlite/model.py +++ b/vlite/model.py @@ -9,38 +9,28 @@ class EmbeddingModel: def __init__(self, model_name='mixedbread-ai/mxbai-embed-large-v1'): hf_path = hf_hub_download(repo_id="mixedbread-ai/mxbai-embed-large-v1", filename="gguf/mxbai-embed-large-v1-f16.gguf") print(f"Downloaded model to {hf_path}") - self.model = llama_cpp.Llama(model_path=hf_path, embedding=True) - self.dimension = 1024 # hardcoded - self.max_seq_length = 512 # hardcoded + self.model_metadata = self.model.metadata + self.embedding_size = self.model_metadata.get("bert.embedding_length", 1024) + self.context_length = self.model_metadata.get("bert.context_length", 512) + self.embedding_dtype = "float32" def embed(self, texts, max_seq_length=512, device="cpu"): if isinstance(texts, str): texts = [texts] embeddings_dict = self.model.create_embedding(texts) return [item["embedding"] for item in embeddings_dict["data"]] - + def token_count(self, texts): - enc = tiktoken.get_encoding("cl100k_base") + enc = tiktoken.get_encoding("cl100k_base") tokens = 0 for text in texts: token_ids = enc.encode(text, disallowed_special=()) tokens += len(token_ids) return tokens - - def quantize(self, embeddings, precision="binary"): - """ - Quantizes the embeddings to the specified precision. - - Args: - embeddings (list or numpy.ndarray): Input embeddings to quantize. - precision (str, optional): Precision to quantize the embeddings. Can be "binary" or "int8". Defaults to "binary". - - Returns: - numpy.ndarray: Quantized embeddings. - """ - embeddings = np.array(embeddings) # Convert embeddings to a numpy array + def quantize(self, embeddings, precision="binary"): + embeddings = np.array(embeddings) if precision == "binary": return np.packbits(embeddings > 0).reshape(embeddings.shape[0], -1) elif precision == "int8": @@ -49,16 +39,4 @@ def quantize(self, embeddings, precision="binary"): raise ValueError(f"Unsupported precision: {precision}") def rescore(self, query_vector, vectors): - """ - Rescores the retrieved vectors using the query vector. - - Args: - query_vector (numpy.ndarray): Query vector for rescoring. - vectors (numpy.ndarray): Retrieved vectors to rescore. - - Returns: - numpy.ndarray: Rescored similarities. - """ - return np.dot(query_vector, vectors.T).flatten() - - \ No newline at end of file + return np.dot(query_vector, vectors.T).flatten() \ No newline at end of file diff --git a/vlite/omom.py b/vlite/omom.py new file mode 100644 index 0000000..1f3dc23 --- /dev/null +++ b/vlite/omom.py @@ -0,0 +1,149 @@ +import os +import struct +import json +from enum import Enum +from typing import List, Dict, Union +import numpy as np + +class OmomSectionType(Enum): + HEADER = 0 + EMBEDDINGS = 1 + CONTEXTS = 2 + METADATA = 3 + +class OmomFile: + MAGIC_NUMBER = b"OMOM" + VERSION = 1 + + def __init__(self, file_path): + self.file_path = file_path + self.header = { + "embedding_model": "default", + "embedding_size": 0, + "embedding_dtype": "float32", + "context_length": 0, + } + self.embeddings = [] + self.contexts = [] + self.metadata = {} + + def set_header(self, embedding_model: str, embedding_size: int, embedding_dtype: str, context_length: int): + self.header["embedding_model"] = embedding_model + self.header["embedding_size"] = embedding_size + self.header["embedding_dtype"] = embedding_dtype + self.header["context_length"] = context_length + + def add_embedding(self, embedding: List[float]): + self.embeddings.append(embedding) + + def add_context(self, context: str): + self.contexts.append(context) + + def add_metadata(self, key: str, value: Union[int, float, str]): + self.metadata[key] = value + + def save(self): + with open(self.file_path, "wb") as file: + file.write(self.MAGIC_NUMBER) + file.write(struct.pack(" OmomFile: + file_path = self.get(user) + return OmomFile(file_path) + + def read(self, user_id: str) -> OmomFile: + file_path = self.get(user_id) + return OmomFile(file_path) + + def delete(self, user_id: str): + file_path = self.get(user_id) + if os.path.exists(file_path): + os.remove(file_path) \ No newline at end of file